From d02d5edfa15ba6c04a9986a8a362a945cb38ac31 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Fri, 22 Jan 2021 09:47:04 +0000
Subject: Integrate improved CPU depthwise convolution kernels

* Replace assembly kernels for depthwise convolution with more optimized ones.
* Add int8 assembly kernels.
* Fix implicit padding on optimized kernels

Resolves: COMPMID-3867, COMPMID-4361

Change-Id: I0b0867e05f61be4f368f62190d55e14d0ab3ebf2
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5622
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
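Note for reviewers: the new arm_conv depthwise kernels sit behind the existing public API, so callers do not change. As orientation, here is a minimal usage sketch (not part of this patch; shapes, strides and padding are illustrative only) of a float NHWC depthwise convolution that the dispatch layer would route to the new depth-first kernels on a supported CPU:

#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NHWC tensors: channels are dimension 0. One batch, 32 channels, 56x56 spatial.
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32, DataLayout::NHWC));
    weights.allocator()->init(TensorInfo(TensorShape(32U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32, DataLayout::NHWC));

    // 3x3 depthwise, stride 1, pad 1 ("same"); depth multiplier 1 keeps one filter per channel.
    NEDepthwiseConvolutionLayer dwc;
    dwc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    dwc.run(); // dispatches to an assembly depthwise kernel when one is available

    return 0;
}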
 Android.bp | 115 +-
 SConscript | 17 +-
 arm_compute/core/utils/quantization/AsymmHelpers.h | 4 +-
 .../NEON/functions/NEDepthwiseConvolutionLayer.h | 2 +-
 ...aped_rhs_only_fused_output_stage_fixedpoint.cpp | 2 -
 .../arm_conv/depthwise/depthwise_depthfirst.hpp | 347 ++
 .../depthwise/depthwise_depthfirst_generic.hpp | 388 ++
 .../depthwise_depthfirst_generic_multiplier.hpp | 480 ++
 ...ise_depthfirst_generic_multiplier_quantized.hpp | 127 +
 .../depthwise_depthfirst_generic_quantized.hpp | 125 +
 .../depthwise/depthwise_depthfirst_multiplier.hpp | 527 ++
 .../depthwise_depthfirst_multiplier_quantized.hpp | 251 +
 .../depthwise/depthwise_depthfirst_quantized.hpp | 412 ++
 .../kernels/arm_conv/depthwise/depthwise_fp16.cpp | 224 +
 .../kernels/arm_conv/depthwise/depthwise_fp32.cpp | 276 +
 .../depthwise/depthwise_implementation.hpp | 143 +
 .../depthwise_implementation_constraints.hpp | 105 +
 .../kernels/arm_conv/depthwise/depthwise_s8q.cpp | 266 +
 .../kernels/arm_conv/depthwise/depthwise_u8q.cpp | 228 +
 .../arm_conv/depthwise/depthwise_u8s8u8q.cpp | 157 +
 .../arm_conv/depthwise/interleaves/8b_mla.cpp | 128 +
 .../depthwise/interleaves/a64_s8q_3x3_dot.cpp | 250 +
 .../depthwise/interleaves/a64_u8q_3x3_dot.cpp | 250 +
 .../arm_conv/depthwise/interleaves/list.hpp | 119 +
 .../depthwise/interleaves/sve_s8q_3x3_dot.cpp | 136 +
 .../depthwise/interleaves/sve_u8q_3x3_dot.cpp | 136 +
 ...4_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 528 ++
 .../generic_indirect.cpp | 515 ++
 ...4_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 829 +++
 .../generic_indirect.cpp | 907 +++
 ...4_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 1233 ++++
 .../generic_indirect.cpp | 1399 +++++
 ...4_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 616 ++
 .../generic_indirect.cpp | 631 ++
 ...4_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 973 ++++
 .../generic_indirect.cpp | 1022 ++++
 ...64_fp16_nhwc_generic_output9_mla_depthfirst.hpp | 59 +
 .../generic.cpp | 527 ++
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 62 +
 .../generic.cpp | 1049 ++++
 ...4_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 68 +
 .../generic_direct.cpp | 524 ++
 .../generic_indirect.cpp | 511 ++
 ...4_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 68 +
 .../generic_direct.cpp | 825 +++
 .../generic_indirect.cpp | 903 +++
 ...4_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 68 +
 .../generic_direct.cpp | 1229 ++++
 .../generic_indirect.cpp | 1395 +++++
 ...4_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 68 +
 .../generic_direct.cpp | 612 ++
 .../generic_indirect.cpp | 627 ++
 ...4_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 68 +
 .../generic_direct.cpp | 969 ++++
 .../generic_indirect.cpp | 1018 ++++
 ...64_fp32_nhwc_generic_output9_mla_depthfirst.hpp | 55 +
 .../generic.cpp | 379 ++
 ...s2_with_multiplier_output3x3_mla_depthfirst.hpp | 66 +
 .../generic.cpp | 532 ++
 ...s1_with_multiplier_output2x4_mla_depthfirst.hpp | 66 +
 .../generic.cpp | 916 +++
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 58 +
 .../generic.cpp | 851 +++
 ...64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75 +
 .../generic.cpp | 1318 +++++
 ...64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 1192 ++++
 ...64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 1423 +++++
 ...64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 2213 +++++++
 ...a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp | 55 +
 .../generic.cpp | 624 ++
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 66 +
 .../generic.cpp | 527 ++
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 66 +
 .../generic.cpp | 662 +++
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 58 +
 .../generic.cpp | 1484 +++++
 ...4_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75 +
 .../generic.cpp | 1184 ++++
 ...64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75 +
 .../generic.cpp | 1318 +++++
 ...64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 1192 ++++
 ...64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 1423 +++++
 ...64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 2213 +++++++
 ...a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp | 55 +
 .../generic.cpp | 624 ++
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 66 +
 .../generic.cpp | 527 ++
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 66 +
 .../generic.cpp | 662 +++
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 58 +
 .../generic.cpp | 1484 +++++
 ...8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 1192 ++++
 ...8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 1423 +++++
 ...8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 2213 +++++++
 ...u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp | 55 +
 .../generic.cpp | 624 ++
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 58 +
 .../generic.cpp | 1484 +++++
 ...e_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 324 ++
 .../generic_indirect.cpp | 284 +
 ...e_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 478 ++
 .../generic_indirect.cpp | 495 ++
 ...e_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 688 +++
 .../generic_indirect.cpp | 746 +++
 ...e_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 345 ++
 .../generic_indirect.cpp | 345 ++
 ...e_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 531 ++
 .../generic_indirect.cpp | 559 ++
 ...e_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 72 +
 .../generic.cpp | 255 +
 .../generic_direct.cpp | 364 ++
 .../generic_indirect.cpp | 318 ++
 ...hwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp | 66 +
 .../generic.cpp | 247 +
 ...e_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 538 ++
 .../generic_indirect.cpp | 547 ++
 ...e_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 688 +++
 .../generic_indirect.cpp | 820 +++
 ...e_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 405 ++
 .../generic_indirect.cpp | 397 ++
 ...e_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72 +
 .../generic_direct.cpp | 531 ++
 .../generic_indirect.cpp | 633 ++
 ...ve_fp32_nhwc_generic_output9_mla_depthfirst.hpp | 59 +
 .../generic.cpp | 166 +
 ...s2_with_multiplier_output3x3_mla_depthfirst.hpp | 70 +
 .../generic.cpp | 259 +
 ...s1_with_multiplier_output2x4_mla_depthfirst.hpp | 70 +
 .../generic.cpp | 392 ++
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 62 +
 .../generic.cpp | 454 ++
 ...ve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75 +
 .../generic.cpp | 457 ++
 ...ve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 418 ++
 ...ve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 459 ++
 ...ve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 660 +++
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 70 +
 .../generic.cpp | 353 ++
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 70 +
 .../generic.cpp | 428 ++
 ...e_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75 +
 .../generic.cpp | 388 ++
 ...ve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75 +
 .../generic.cpp | 457 ++
 ...ve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 418 ++
 ...ve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 459 ++
 ...ve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 660 +++
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 70 +
 .../generic.cpp | 353 ++
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 70 +
 .../generic.cpp | 428 ++
 ...8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 418 ++
 ...8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 459 ++
 ...8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75 +
 .../generic.cpp | 660 +++
 src/core/NEON/kernels/arm_gemm/utils.hpp | 12 +
 .../NEDepthwiseConvolutionAssemblyKernelWrapper.h | 88 -
 src/core/NEON/kernels/assembly/common.hpp | 34 +
 src/core/NEON/kernels/assembly/depthwise.hpp | 170 +
 .../NEON/kernels/assembly/depthwise_common.hpp | 131 +
 src/core/NEON/kernels/assembly/pool_common.hpp | 9 +-
 .../kernels/convolution/depthwise/depthwise.hpp | 551 --
 .../depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp | 1168 ----
 .../depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp | 2809 ---------
 .../depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp | 2341 --------
 .../depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp | 769 ---
 .../depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp | 6018 --------------------
 .../convolution/depthwise/depthwise_dilated.cpp | 42 -
 .../convolution/depthwise/depthwise_dilated.hpp | 156 -
 .../depthwise/depthwise_dilated_qa8_qa8.cpp | 144 -
 .../convolution/depthwise/depthwise_fp16.cpp | 34 -
 .../convolution/depthwise/depthwise_fp32.cpp | 31 -
 .../depthwise/depthwise_pack_parameters.cpp | 102 -
 .../convolution/depthwise/depthwise_qa8_qa8.cpp | 32 -
 .../convolution/depthwise/depthwise_qs8_qs8.cpp | 31 -
 .../convolution/depthwise/depthwise_quantized.hpp | 291 -
 .../depthwise/depthwise_quantized_dilated.hpp | 88 -
 .../kernels/convolution/depthwise/impl_base.hpp | 505 --
 .../kernels/convolution/depthwise/impl_dilated.hpp | 295 -
 .../convolution/depthwise/impl_fp16_fp16.hpp | 439 --
 .../convolution/depthwise/impl_fp32_fp32.hpp | 438 --
 .../kernels/convolution/depthwise/impl_qa8_qa8.hpp | 511 --
 .../depthwise/impl_qa8_qs8_per_channel.hpp | 457 --
 .../cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 33 +-
 .../CpuDepthwiseConv2dAssemblyWrapperKernel.cpp | 359 ++
 .../CpuDepthwiseConv2dAssemblyWrapperKernel.h | 120 +
 .../internal/CpuPool2dAssemblyWrapperKernel.cpp | 7 +-
 src/core/utils/AssemblyUtils.cpp | 70 +
 src/core/utils/AssemblyUtils.h | 52 +
 src/core/utils/quantization/AsymmHelpers.cpp | 7 +-
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 4 -
 .../CL/functions/CLGEMMConvolutionLayer.cpp | 2 -
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 7 +-
 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp | 16 +-
 src/runtime/cpu/operators/CpuDepthwiseConv2d.h | 5 +-
 .../CpuDepthwiseConv2dAssemblyDispatch.cpp | 520 +-
 .../operators/CpuDepthwiseConv2dAssemblyDispatch.h | 27 +-
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 37 +-
 tests/datasets/DepthwiseConvolutionLayerDataset.h | 8 +-
 .../fixtures/DepthwiseConvolutionLayerFixture.h | 3 +-
 230 files changed, 78859 insertions(+), 17935 deletions(-)
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
 delete mode 100644 src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
 create mode 100644 src/core/NEON/kernels/assembly/common.hpp
 create mode 100644 src/core/NEON/kernels/assembly/depthwise.hpp
 create mode 100644 src/core/NEON/kernels/assembly/depthwise_common.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
 delete mode 100644 src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
 create mode 100644 src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
 create mode 100644 src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
 create mode 100644 src/core/utils/AssemblyUtils.cpp
 create mode 100644 src/core/utils/AssemblyUtils.h

diff --git a/Android.bp b/Android.bp
index 87bdcfcccd..19645c0c26 100644
--- a/Android.bp
+++ b/Android.bp
@@ -48,7 +48,6 @@ cc_library_static {
         "src/core/helpers",
         "src/core/NEON/kernels/assembly",
         "src/core/NEON/kernels/convolution/common",
-        "src/core/NEON/kernels/convolution/depthwise",
         "src/core/NEON/kernels/convolution/winograd",
         "src/core/cpu/kernels/assembly"],
     export_include_dirs: [".", "./include"],
@@ -206,6 +205,12 @@ cc_library_static {
         "src/core/NEON/kernels/NETileKernel.cpp",
         "src/core/NEON/kernels/NEWeightsReshapeKernel.cpp",
         "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
         "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
         "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
         "src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
@@ -236,18 +241,6 @@ cc_library_static {
         "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
         "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
         "src/core/NEON/kernels/convolution/common/utils.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp",
-        "src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp",
"src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp", "src/core/NEON/kernels/convolution/winograd/padding.cpp", "src/core/NEON/kernels/convolution/winograd/winograd.cpp", "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp", @@ -325,6 +318,7 @@ cc_library_static { "src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp", "src/core/cpu/kernels/floor/neon/fp16.cpp", "src/core/cpu/kernels/floor/neon/fp32.cpp", + "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp", "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp", "src/core/cpu/kernels/pooling/neon/fp16.cpp", "src/core/cpu/kernels/pooling/neon/fp32.cpp", @@ -392,6 +386,7 @@ cc_library_static { "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyValhall.cpp", "src/core/helpers/SoftmaxHelpers.cpp", "src/core/helpers/WindowHelpers.cpp", + "src/core/utils/AssemblyUtils.cpp", "src/core/utils/ScaleUtils.cpp", "src/core/utils/helpers/fft.cpp", "src/core/utils/helpers/tensor_transform.cpp", @@ -703,6 +698,100 @@ cc_library_static { }, arm64: { srcs: [ + "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", diff --git a/SConscript b/SConscript index 94ba6d423f..3e834e347c 100644 --- a/SConscript +++ b/SConscript @@ -1,4 +1,4 @@ -# Copyright (c) 2016, 2017 Arm Limited. +# Copyright (c) 2016-2021 Arm Limited. 
 #
 # SPDX-License-Identifier: MIT
 #
@@ -252,12 +252,8 @@ core_files_sve = []
 if env['neon']:
     core_files += Glob('src/core/NEON/*.cpp')
     core_files += Glob('src/core/NEON/kernels/*.cpp')
-    core_files += Glob('src/core/NEON/kernels/assembly/*.cpp')
     core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp')
-    core_files += Glob('src/core/NEON/kernels/arm_conv/*.cpp')
-    core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/*.cpp')
-    core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_*/*.cpp')

     # build winograd/depthwise sources for either v7a / v8a
     core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp')
@@ -275,11 +271,22 @@ if env['neon']:
     core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a32_*/*.cpp')

     if env['estate'] == '64':
+        core_files += Glob('src/core/NEON/kernels/assembly/*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/kernels/cpp_*/*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_*/*.cpp')
+
+        core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_*/*.cpp')
         core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/a64_*/*.cpp')

     if "sve" in env['arch'] or env['fat_binary']:
         core_files_sve += filelist['cpu']['core']['sve']['all']
         core_files_sve += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_*.cpp')
+        core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_*/*.cpp')
         core_files_sve += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp')

     if any(i in env['data_layout_support'] for i in ['all', 'nchw']):
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index cbf7559bc9..c9d0930c3a 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -89,7 +89,6 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
 * @param[in]  input                  Input tensor info.
 * @param[in]  weights                Weights tensor info.
 * @param[in]  output                 Output tensor info.
- * @param[in]  idx_ofms               Dimension index to get OFMs from the weights tensor.
 * @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers.
 * @param[out] output_shifts_ptr      Pointer to the buffer where to store per-channel shifts.
 *
@@ -98,7 +97,6 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
 void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
                                               const ITensorInfo *weights,
                                               const ITensorInfo *output,
-                                              unsigned int idx_ofms,
                                               int32_t *output_multipliers_ptr,
                                               int32_t *output_shifts_ptr);
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 2f541758f4..6f2ec8cddb 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -112,7 +112,7 @@ private:
 *
 * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
 * -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present
- * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref cpu::CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
 * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
 * -# @ref NEActivationLayer if fused activation is required
 *
diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
index d8f8f1498a..ca7b7a5f04 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
@@ -236,7 +236,6 @@ public:
         gemmlowp_output_stage.output_data_type = dst.info()->data_type();
         gemmlowp_output_stage.gemmlowp_offset  = 0;
         {
-            const int idx_kernels = get_data_layout_dimension_index(lhs.info()->data_layout(), DataLayoutDimension::BATCHES);
             gemmlowp_output_stage.is_quantized_per_channel = false;
             // Num_filters is 1 unless quantized type is of per_channel type. Could be extended in the future to support per-channel quantization.
             const unsigned int num_filters = 1;
@@ -249,7 +248,6 @@ public:
             quantization::compute_quantized_multipliers_and_shifts(lhs.info(),
                                                                    rhs.info(),
                                                                    dst.info(),
-                                                                   idx_kernels,
                                                                    gemmlowp_output_stage.gemmlowp_multipliers.data(),
                                                                    gemmlowp_output_stage.gemmlowp_shifts.data());
             gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
new file mode 100644
index 0000000000..fe635d6fad
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirst : public DepthwiseCommon<typename strategy::input_type,
+                                                   typename strategy::weight_type,
+                                                   typename strategy::return_type>
+{
+  using TInput = typename strategy::input_type;
+  using TWeight = typename strategy::weight_type;
+  using TOutput = typename strategy::return_type;
+  using TAccum = typename strategy::bias_type;
+
+  size_t sizeof_input_buffer(unsigned int n_input_channels) const
+  {
+    return sizeof(TInput) * n_input_channels;
+  }
+
+  size_t sizeof_output_buffer(unsigned int n_output_channels) const
+  {
+    return sizeof(TOutput) * n_output_channels;
+  }
+
+  public:
+
+  DepthwiseDepthfirst(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+  {
+  }
+
+  DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
+  DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
+
+  size_t get_storage_size(void) const override
+  {
+    // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+    const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl);
+    return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+  }
+
+  void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+  {
+    // TODO What if the kernel needs a different packing function?
+
+    // Cast the pointers
+    uint8_t *buffer = static_cast<uint8_t *>(_buffer);
+    const TAccum *biases = static_cast<const TAccum *>(_biases);
+    const TWeight *const weights = static_cast<const TWeight *>(_weights);
+
+    const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+    ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
+    ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+    for (unsigned int n = 0; n < this->m_args.input_channels; n += vl)
+    {
+      const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
+
+      // Copy across the correct amount of bias (or 0)
+      for (unsigned int i = 0; i < todo; i++)
+      {
+        reinterpret_cast<TAccum *>(buffer)[i] = (biases == nullptr) ?
0 : biases[n + i]; + } + buffer += vl * sizeof(TAccum); + + // Copy each of the weights in turn + auto weights_row = weights + n; + for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) + { + auto weights_col = weights_row; + + for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) + { + for (unsigned int m = 0; m < todo; m++) + { + reinterpret_cast(buffer)[m] = weights_col[m]; + } + buffer += vl * sizeof(TWeight); + + weights_col += ld_weight_col; + } + + weights_row += ld_weight_row; + } + } + } + + size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + { + const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; + return n_threads * (sizeof_output_buffer(n_output_channels) + sizeof_input_buffer(n_channels)); + } + + using DepthwiseCommon::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + // Compute activation values + TAccum activation_min, activation_max; + if (std::numeric_limits::is_integer) + { + activation_min = std::numeric_limits::min(); + activation_max = std::numeric_limits::max(); + } + else + { + activation_min = static_cast(-std::numeric_limits::infinity()); + activation_max = static_cast(std::numeric_limits::infinity()); + } + + switch (this->m_args.activation.type) + { + case arm_gemm::Activation::Type::BoundedReLU: + activation_max = static_cast(this->m_args.activation.param1); + // Fall through + case arm_gemm::Activation::Type::ReLU: + activation_min = static_cast(0); + break; + default: + break; + } + + // Determine what portion of the work to do. 
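// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the interleaved layout that
// pack_parameters() above produces, restated over plain float buffers with a
// hypothetical vector length VL. Each block of VL channels stores the bias
// values first, then the weights for every kernel point in turn, so the
// kernel can stream one block per channel step. All names are hypothetical.
#include <algorithm>
#include <vector>

std::vector<float> pack_block_sketch(const std::vector<float> &biases,   // may be empty
                                     const std::vector<float> &weights,  // [kernel_points][n_channels]
                                     unsigned int n_channels,
                                     unsigned int kernel_points,
                                     unsigned int VL)
{
  std::vector<float> buffer;
  for (unsigned int n = 0; n < n_channels; n += VL)
  {
    const unsigned int todo = std::min(VL, n_channels - n);

    // Bias (or zero) for this block of channels, padded out to VL lanes.
    for (unsigned int i = 0; i < VL; i++)
    {
      buffer.push_back((i < todo && !biases.empty()) ? biases[n + i] : 0.0f);
    }

    // Then each kernel point's weights for the same block of channels.
    for (unsigned int k = 0; k < kernel_points; k++)
    {
      for (unsigned int i = 0; i < VL; i++)
      {
        buffer.push_back(i < todo ? weights[k * n_channels + n + i] : 0.0f);
      }
    }
  }
  return buffer;
}
// ---------------------------------------------------------------------------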
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); + const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); + const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); + + // Cast input and output pointers into the right types + const TInput *const inptr = static_cast(_input); + TOutput *const outptr = static_cast(_output); + + // Create an array for the input pointers + const TInput * _inptr_array[strategy::input_rows * strategy::input_cols]; + const TInput **const inptr_array = _inptr_array; + + // Create an array for the output pointers + TOutput * _outptr_array[strategy::output_rows * strategy::output_cols]; + TOutput **const outptr_array = _outptr_array; + + // Allocate portions of the working space + uint8_t *const working_space = static_cast(_working_space) + get_working_size(thread_id, input_channels); + TOutput *const output_buffer = reinterpret_cast(working_space); + TInput *const input_buffer = reinterpret_cast(working_space + sizeof_output_buffer(input_channels * this->m_args.channel_multiplier)); + + // Initialise the input buffer + for (unsigned int c = 0; c < input_channels; c++) + { + input_buffer[c] = static_cast(0); + } + + // For each output tile, construct the requisite set of pointers and call + // into the kernel. + for (unsigned int batch = 0; batch < batches; batch++) + { + // Get batch pointers + const auto inptr_batch = inptr + batch * ld_input_batch; + const auto outptr_batch = outptr + batch * ld_output_batch; + + for (int start_out_i = start_out_height; + start_out_i < end_out_height; + start_out_i += static_cast(strategy::output_rows)) + { + const int end_out_i = start_out_i + strategy::output_rows; + const int start_in_i = start_out_i * strategy::stride_rows - padding.top; + const int end_in_i = start_in_i + strategy::input_rows; + + // Compute top/bottom padding + const auto pad_top = static_cast(-std::min(start_in_i, 0)); + const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); + const unsigned int valid_output_rows = std::min( + end_out_i - start_out_i, + static_cast(output_height) - start_out_i + ); + + // Fill the input pointer array with padding values + for (auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++) + { + inptr_array[index] = input_buffer; + } + + for (int start_out_j = 0; start_out_j < static_cast(output_width);) + { + const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left; + const int pad_left = -std::min(0, start_in_j); + + // Compute how many output tiles we can compute with the direct kernel. + int n_direct_tiles = 0; + if (!pad_top && !pad_bottom && !pad_left) + { + // Determine the maximum number of tiles we could handle. + n_direct_tiles = (output_width - start_out_j) / strategy::output_cols; + + // Continue to reduce this number as required to avoid reading + // padding on the right edge. + int end_in_j = start_in_j + n_direct_tiles * strategy::input_cols; + int pad_right = std::max(0, end_in_j - static_cast(input_width)); + + while (pad_right && n_direct_tiles) + { + n_direct_tiles--; + end_in_j -= strategy::input_cols; + pad_right = std::max(0, end_in_j - static_cast(input_width)); + } + } + + // Use the unpadded kernel if we can, otherwise use the padded one. 
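// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the n_direct_tiles logic
// computed above, in isolation. Take as many whole output tiles as fit, then
// back off while the corresponding input span would read implicit padding on
// the right edge. All names here are hypothetical.
int count_direct_tiles_sketch(int start_out_j, int start_in_j,
                              int output_width, int input_width,
                              int tile_output_cols, int tile_input_cols)
{
  int n_tiles = (output_width - start_out_j) / tile_output_cols;
  int end_in_j = start_in_j + n_tiles * tile_input_cols;
  while (n_tiles > 0 && end_in_j > input_width)
  {
    // One fewer tile; shrink the input span accordingly.
    n_tiles--;
    end_in_j -= tile_input_cols;
  }
  return n_tiles;
}
// ---------------------------------------------------------------------------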
+ if (n_direct_tiles) + { + auto inptr = inptr_batch + start_in_i*ld_input_row + start_in_j*ld_input_col; + auto outptr = outptr_batch + start_out_i*ld_output_row + start_out_j*ld_output_col; + start_out_j += n_direct_tiles*strategy::output_cols; + +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, 0); +#endif + strat.direct_kernel(1, n_direct_tiles, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + parameters, this->m_args.input_channels, + activation_min, activation_max); + continue; + } + + const int end_out_j = start_out_j + strategy::output_cols; + const int end_in_j = start_in_j + strategy::input_cols; + + const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); + const unsigned int valid_output_cols = std::min( + end_out_j - start_out_j, + static_cast(output_width) - start_out_j + ); + + // Construct the input pointer array - fill the array with pointers to + // the input buffer and then fill in the required values. + for (auto i = pad_top; i < strategy::input_rows - pad_bottom; i++) + { + // Can skip over the left padding because we will have either the + // same or less than the previous tile. + unsigned int j = pad_left; + const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col; + const TInput **ptrs = inptr_array + i * strategy::input_cols + j; + for (; j < strategy::input_cols - pad_right; j++) + { + *(ptrs++) = colptr; + colptr += ld_input_col; + } + for (; j < strategy::input_cols; j++) + { + *(ptrs++) = input_buffer; + } + } + + // Construct the output pointer array. + TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < valid_output_rows; i++) + { + unsigned int j = 0u; + TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; + for (; j < valid_output_cols; j++) + { + *(outptr_pos++) = colptr; + colptr += ld_output_col; + } + for (; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + for (auto i = valid_output_rows; i < strategy::output_rows; i++) + { + for (auto j = 0u; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + + start_out_j += strategy::output_cols; + +#ifdef CYCLE_PROFILING + // TODO Work number + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(0)); +#endif + strat.indirect_kernel(inptr_array, outptr_array, parameters, + this->m_args.input_channels, activation_min, activation_max); + } + } + } + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp new file mode 100644 index 0000000000..29f37c5697 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +namespace arm_conv { +namespace depthwise { + +template +class DepthwiseDepthfirstGenericBase : + public DepthwiseCommon +{ + protected: + + using TInput = typename Strategy::input_type; + using TWeight = typename Strategy::weight_type; + using TOutput = typename Strategy::return_type; + using TAccum = typename Strategy::bias_type; + + size_t sizeof_input_ptr_array(void) const + { + return sizeof(TInput *) * this->m_args.kernel_rows * this->m_args.kernel_cols * Strategy::n_output_points; + } + + size_t sizeof_input_buffer(unsigned int n_channels) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); + const auto rounded_channels = arm_gemm::roundup(n_channels, vl); + return sizeof(TInput) * rounded_channels; + } + + size_t sizeof_output_buffer(unsigned int n_channels) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); + const auto rounded_channels = arm_gemm::roundup(n_channels, vl); + return sizeof(TOutput) * rounded_channels; + } + + unsigned int input_rows(void) const + { + return this->m_args.kernel_rows + (OutputRows - 1)*this->m_args.stride_rows; + } + + unsigned int input_cols(void) const + { + return this->m_args.kernel_cols + (OutputCols - 1)*this->m_args.stride_cols; + } + + void execute_tiles( + std::function tile_fn, + std::function initialise_input_buffer, + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const + { + static_assert(OutputRows * OutputCols <= Strategy::n_output_points, + "Too many output points for kernel."); + + // Determine what portion of the work to do. 
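// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the row-wise work
// decomposition computed below, which every execute/execute_tiles variant in
// this patch shares. Rows of output are dealt out in contiguous chunks of
// ceil(height / n_threads); both bounds are clamped, so trailing threads may
// legitimately receive an empty range.
#include <algorithm>

void thread_row_range_sketch(unsigned int output_height,
                             unsigned int thread_id, unsigned int n_threads,
                             unsigned int &start_row, unsigned int &end_row)
{
  const unsigned int rows_per_thread =
      (output_height + n_threads - 1) / n_threads;  // iceildiv
  start_row = std::min(thread_id * rows_per_thread, output_height);
  end_row   = std::min(start_row + rows_per_thread, output_height);
}
// ---------------------------------------------------------------------------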
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); + const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); + const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); + + // Cast input and output pointers into the right types + const TInput *const inptr = static_cast(_input); + TOutput *const outptr = static_cast(_output); + + // Allocate portions of the working space + uint8_t *const working_space = static_cast(_working_space) + this->get_working_size(thread_id, input_channels); + const TInput **const inptr_array = reinterpret_cast(working_space); + TOutput *const output_buffer = reinterpret_cast(working_space + this->sizeof_input_ptr_array()); + TInput *const input_buffer = reinterpret_cast(working_space + this->sizeof_input_ptr_array() + this->sizeof_output_buffer(input_channels * this->m_args.channel_multiplier)); + + // Create an array for the output pointers + TOutput * _outptr_array[Strategy::n_output_points]; + TOutput **const outptr_array = _outptr_array; + + // Initialise the input buffer + initialise_input_buffer(input_buffer, input_channels); + + // For each output tile, construct the requisite set of pointers and call + // into the kernel. + for (unsigned int batch = 0; batch < batches; batch++) + { + // Get batch pointers + const auto inptr_batch = inptr + batch * ld_input_batch; + const auto outptr_batch = outptr + batch * ld_output_batch; + + for (int start_out_i = start_out_height; + start_out_i < end_out_height; + start_out_i += static_cast(OutputRows)) + { + const int end_out_i = std::min(start_out_i + OutputRows, + output_height); + + for (int start_out_j = 0; + start_out_j < static_cast(output_width); + start_out_j += static_cast(OutputCols)) + { + const int end_out_j = std::min(start_out_j + OutputCols, + output_width); + + // Fill the pointer arrays with pointers to the input/output buffers. + for (auto index = 0u; + index < (Strategy::n_output_points * this->m_args.kernel_rows * this->m_args.kernel_cols); + index++) + { + inptr_array[index] = input_buffer; + } + for (auto index = 0u; index < Strategy::n_output_points; index++) + { + outptr_array[index] = output_buffer; + } + + // Construct the pointer arrays together. Note that the input pointer + // array is striped. Since the array has already been filled with + // pointers to the padding array we merely fill in the valid points + // as we get to them. + unsigned int output_index = 0; + auto outptr_row = outptr_batch + start_out_i * ld_output_row + start_out_j * ld_output_col; + for (auto out_i = start_out_i; out_i < end_out_i; out_i++) + { + auto outptr_col = outptr_row; + + // Compute the padding for this row of tiles. + const int start_in_i = out_i * this->m_args.stride_rows - padding.top; + const int end_in_i = start_in_i + this->m_args.kernel_rows; + const auto pad_top = static_cast(std::max(0, 0 - start_in_i)); + const auto pad_bottom = static_cast(std::max(0, end_in_i - input_height)); + const unsigned int valid_rows = this->m_args.kernel_rows - pad_top - pad_bottom; + + for (auto out_j = start_out_j; out_j < end_out_j; out_j++, output_index++) + { + // Compute the output pointer. + outptr_array[output_index] = outptr_col; + outptr_col += ld_output_col; + + // Compute the padding for this tile. 
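// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the "striped" input pointer
// array described above, reduced to one dimension. For P output points and K
// kernel points, entry [k * P + p] is the pointer read by kernel point k when
// computing output point p; out-of-range taps keep pointing at a shared
// padding buffer. Scalar float data is an assumption made for clarity.
#include <vector>

std::vector<const float *> striped_ptr_array_sketch(
    const float *input, int input_len,   // a single input row
    const float *pad_buffer,             // shared padding element
    int n_output_points, int kernel_points, int stride)
{
  std::vector<const float *> ptrs(kernel_points * n_output_points, pad_buffer);
  for (int k = 0; k < kernel_points; k++)
  {
    for (int p = 0; p < n_output_points; p++)
    {
      const int idx = p * stride + k;    // input element this tap reads
      if (0 <= idx && idx < input_len)
      {
        ptrs[k * n_output_points + p] = input + idx;
      }
    }
  }
  return ptrs;
}
// ---------------------------------------------------------------------------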
+ const int start_in_j = out_j * this->m_args.stride_cols - padding.left; + const int end_in_j = start_in_j + this->m_args.kernel_cols; + const auto pad_left = static_cast(std::max(0, 0 - start_in_j)); + const auto pad_right = static_cast(std::max(0, end_in_j - input_width)); + const unsigned int valid_cols = this->m_args.kernel_cols - pad_left - pad_right; + + // Hence compute the input pointers. + auto input_index = output_index + Strategy::n_output_points * (pad_top * this->m_args.kernel_cols + pad_left); + auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row + (start_in_j + pad_left) * ld_input_col; + for (auto in_i = 0u; in_i < valid_rows; in_i++) + { + auto inptr_col = inptr_row; + auto input_index_col = input_index; + + for (auto in_j = 0u; in_j < valid_cols; in_j++) + { + inptr_array[input_index_col] = inptr_col; + inptr_col += ld_input_col; + input_index_col += Strategy::n_output_points; + } + + inptr_row += ld_input_row; + input_index += Strategy::n_output_points * this->m_args.kernel_cols; + } + } + + outptr_row += ld_output_row; + } + + tile_fn(inptr_array, outptr_array); + } + } + } + } + + public: + DepthwiseDepthfirstGenericBase(const DepthwiseArgs &args) : DepthwiseCommon(args) + { + } + + DepthwiseDepthfirstGenericBase(DepthwiseDepthfirstGenericBase &) = delete; + DepthwiseDepthfirstGenericBase &operator=(DepthwiseDepthfirstGenericBase &) = delete; + + size_t get_storage_size(void) const override + { + const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); + const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl); + return (this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight); + } + + void pack_parameters(void *_buffer, const void *, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override + { + // Cast the pointers + TWeight *buffer = static_cast(_buffer); + const TWeight *const weights = static_cast(_weights); + + const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); + ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col; + ld_weight_row = (ld_weight_row == 0) ? 
this->m_args.kernel_cols * ld_weight_col : ld_weight_row; + + for (unsigned int n = 0; n < this->m_args.input_channels; n += vl) + { + const unsigned int todo = std::min(vl, this->m_args.input_channels - n); + + // Copy each of the weights in turn + auto weights_row = weights + n; + for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) + { + auto weights_col = weights_row; + + for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) + { + for (unsigned int m = 0; m < todo; m++) + { + buffer[m] = weights_col[m]; + } + buffer += vl; + + weights_col += ld_weight_col; + } + + weights_row += ld_weight_row; + } + } + } + + size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + { + const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; + return n_threads * (sizeof_input_ptr_array() + + sizeof_output_buffer(n_output_channels) + + sizeof_input_buffer(n_channels)); + } +}; + +template +class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstGenericBase +{ + using Parent = DepthwiseDepthfirstGenericBase; + using TInput = typename Parent::TInput; + using TWeight = typename Parent::TWeight; + using TAccum = typename Parent::TAccum; + using TOutput = typename Parent::TOutput; + + const TAccum *m_bias = nullptr; + + public: + DepthwiseDepthfirstGeneric(const DepthwiseArgs &args) : Parent(args) + { + } + + DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete; + DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete; + + void pack_parameters(void *buffer, const void *bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + { + m_bias = static_cast(bias); + Parent::pack_parameters(buffer, bias, weights, ld_weight_col, ld_weight_row); + } + + using DepthwiseDepthfirstGenericBase::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + Strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + // Compute activation values + TAccum activation_min, activation_max; + if (std::numeric_limits::is_integer) + { + activation_min = std::numeric_limits::min(); + activation_max = std::numeric_limits::max(); + } + else + { + activation_min = static_cast(-std::numeric_limits::infinity()); + activation_max = static_cast(std::numeric_limits::infinity()); + } + + switch (this->m_args.activation.type) + { + case arm_gemm::Activation::Type::BoundedReLU: + activation_max = static_cast(this->m_args.activation.param1); + // Fall through + case arm_gemm::Activation::Type::ReLU: + activation_min = static_cast(0); + break; + default: + break; + } + + // Create a function to initialise the input buffer + const auto initialise_input_buffer = [] (TInput *const buffer, const unsigned int n) { + std::memset(buffer, 0, n * sizeof(TInput)); + }; + + // Create a function to execute a tile of work + const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput 
*const * const outptrs) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler( + PROFILE_KERNEL, + (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows* this->m_args.kernel_cols) + ); +#endif + strat.kernel(inptrs, outptrs, parameters, m_bias, + this->m_args.kernel_rows * this->m_args.kernel_cols, + this->m_args.input_channels, activation_min, activation_max); + }; + + // Call into a parent utility function to do the actual work. + Parent::execute_tiles( + tile_fn, initialise_input_buffer, + batches, input_height, input_width, input_channels, padding, + _input, ld_input_col, ld_input_row, ld_input_batch, + output_height, output_width, + _output, ld_output_col, ld_output_row, ld_output_batch, + _working_space, thread_id, n_threads + ); + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp new file mode 100644 index 0000000000..656e4413b2 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +namespace arm_conv { +namespace depthwise { + +template +class DepthwiseDepthfirstGenericWithMultiplierBase : + public DepthwiseCommon +{ + protected: + + using TInput = typename strategy::input_type; + using TWeight = typename strategy::weight_type; + using TOutput = typename strategy::return_type; + using TAccum = typename strategy::bias_type; + + unsigned int kernel_points(void) const + { + return this->m_args.kernel_rows * this->m_args.kernel_cols; + } + + unsigned int input_rows(void) const + { + return (strategy::output_rows() - 1) * this->m_args.stride_rows + this->m_args.kernel_rows; + } + + unsigned int input_cols(void) const + { + return (strategy::output_cols() - 1) * this->m_args.stride_cols + this->m_args.kernel_cols; + } + + size_t sizeof_inptr_array(void) const + { + return sizeof(TInput *) * kernel_points() * strategy::output_rows(); + } + + size_t sizeof_input_samples(void) const + { + // We have a sample for each kernel point, for each point of the output array. 
+ return sizeof(TInput) * kernel_points() * + strategy::output_rows() * + strategy::output_col_regs() * + (16 / sizeof(TAccum)); + } + + size_t sizeof_outptr_array(void) const + { + return sizeof(TOutput *) * strategy::output_rows() * strategy::output_cols(); + } + + size_t sizeof_output_buffer(unsigned int n_channels) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const auto rounded_channels = arm_gemm::roundup(n_channels, vl); + return sizeof(TOutput) * rounded_channels; + } + + void pack_weights(TWeight *buffer, const TWeight *weights, size_t ld_weight_col, size_t ld_weight_row) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col; + ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row; + + for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++) + { + for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl) + { + const unsigned int out_c = in_c * this->m_args.channel_multiplier + n; + const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n); + + // Copy each of the weights in turn + auto weights_row = weights + out_c; + for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) + { + auto weights_col = weights_row; + + for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) + { + for (unsigned int m = 0; m < todo; m++) + { + buffer[m] = weights_col[m]; + } + buffer += vl; + + weights_col += ld_weight_col; + } + + weights_row += ld_weight_row; + } + } + } + } + + void execute_tiles( + std::function tile_fn, + const TInput pad_value, + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const + { +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + // Determine what portion of the work to do. + const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); + const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); + const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); + + // Need a stride over blocks of parameters + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const unsigned int param_stride = arm_gemm::roundup(this->m_args.channel_multiplier, vl) * kernel_points(); + + // Cast input and output pointers into the right types + const TInput *const inptr = static_cast(_input); + TOutput *const outptr = static_cast(_output); + + // Allocate portions of the working space + uint8_t *working_space = static_cast(_working_space) + + get_working_size(thread_id, input_channels); + + const TInput **inptrs = reinterpret_cast(working_space); + working_space += sizeof_inptr_array(); + + // To simplify the kernel, we process padded or non-NCHW-ordered input into + // a form which can be consumed by the kernel. 
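// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the sizing idiom used by
// sizeof_input_samples() and its sibling helpers. A count is rounded up to a
// hardware granule (a vector length, or a 16-byte "quad" of elements) before
// being multiplied out, so every buffer is safely over-allocated for full
// vector loads and stores. The granule is a parameter here only because
// strategy::vl_type is resolved elsewhere.
#include <cstddef>

std::size_t roundup_sketch(std::size_t value, std::size_t granule)
{
  return ((value + granule - 1) / granule) * granule;
}

std::size_t padded_buffer_bytes_sketch(std::size_t n_elements,
                                       std::size_t granule,
                                       std::size_t sizeof_element)
{
  return roundup_sketch(n_elements, granule) * sizeof_element;
}
// ---------------------------------------------------------------------------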
This data is stored here and + // passed into the kernel as an array of N pointers (one per row of the + // input). + TInput *rearranged_input = reinterpret_cast(working_space); + working_space += sizeof_input_samples(); + + TOutput **outptr_array = reinterpret_cast(working_space); + working_space += sizeof_outptr_array(); + + TOutput *const output_buffer = reinterpret_cast(working_space); + + // TODO Dynamically change the input pointer array in cases where we could + // read directly from the input tensor; for now though assume we will + // always read from the sample array. + { + auto my_inptrs = inptrs; + auto my_input_samples = rearranged_input; + + // For each kernel point; for each row of output; for each register of + // values containing a QUAD of source values. + const unsigned int quad_length = 16 / sizeof(TAccum); + + for (auto p = 0u; p < kernel_points() * strategy::output_rows(); p++) + { + *(my_inptrs)++ = my_input_samples; + my_input_samples += arm_gemm::roundup(strategy::output_cols(), quad_length); + } + } + + // For each output tile, construct the requisite set of pointers and call + // into the kernel. + for (unsigned int batch = 0; batch < batches; batch++) + { + // Get batch pointers + const auto inptr_batch = inptr + batch * ld_input_batch; + const auto outptr_batch = outptr + batch * ld_output_batch; + + for (int start_out_i = start_out_height; + start_out_i < end_out_height; + start_out_i += static_cast(strategy::output_rows())) + { + const int end_out_i = std::min(start_out_i + static_cast(strategy::output_rows()), end_out_height); + const int start_in_i = start_out_i * this->m_args.stride_rows - padding.top; + const int end_in_i = start_in_i + input_rows(); + + // Compute top/bottom padding + const auto pad_top = static_cast(-std::min(start_in_i, 0)); + const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); + const unsigned int valid_output_rows = std::min( + end_out_i - start_out_i, + static_cast(output_height) - start_out_i + ); + + const int pad_rows = pad_top + pad_bottom; + + for (int start_out_j = 0; start_out_j < static_cast(output_width);) + { + const int start_in_j = start_out_j * this->m_args.stride_cols - this->m_args.padding.left; + const int pad_left = -std::min(0, start_in_j); + + const int end_out_j = start_out_j + strategy::output_cols(); + const int end_in_j = start_in_j + input_cols(); + + const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); + const unsigned int valid_output_cols = std::min( + end_out_j - start_out_j, + static_cast(output_width) - start_out_j + ); + + const int pad_cols = pad_left + pad_right; + + // Construct the output pointer array. + TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < valid_output_rows; i++) + { + unsigned int j = 0u; + TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; + for (; j < valid_output_cols; j++) + { + *(outptr_pos++) = colptr; + colptr += ld_output_col; + } + for (; j < strategy::output_cols(); j++) + { + *(outptr_pos++) = output_buffer; + } + } + for (auto i = valid_output_rows; i < strategy::output_rows(); i++) + { + for (auto j = 0u; j < strategy::output_cols(); j++) + { + *(outptr_pos++) = output_buffer; + } + } + + start_out_j += strategy::output_cols(); + + const TWeight *params = static_cast(parameters); + + // Fill the input samples with padding. 
We can do this outside of + // the channel loop, as the position of padding isn't going to + // change as a function of channel. + for (auto i = 0u; i < kernel_points() * strategy::output_rows() * strategy::output_cols(); i++) + { + rearranged_input[i] = pad_value; + } + + // Loop over the input channels + for (unsigned int in_c = 0; in_c < input_channels; in_c++) + { + auto inptr_row = inptr_batch + in_c + + (start_in_i + pad_top) * ld_input_row + + (start_in_j + pad_left) * ld_input_col; + + // Construct the array of input samples; for each point of the + // kernel we provide an input value for each output point. + auto input_samples = rearranged_input; + for (auto ki = 0u; ki < this->m_args.kernel_rows; ki++) + { + for (auto kj = 0u; kj < this->m_args.kernel_cols; kj++) + { + // Copy the pointer for the input samples associated with this + // kernel point. Then update the main pointer to account for + // this point. + auto point_input_samples = input_samples; + input_samples += strategy::output_rows() * strategy::output_cols(); + + int ii = static_cast(ki) - static_cast(pad_top); + for (auto oi = 0u; + oi < strategy::output_rows() && + ii < static_cast(input_rows()) - pad_rows; + oi++, ii += this->m_args.stride_rows) + { + if (0 <= ii) // Fill in values only if this row is in range. + { + int ij = static_cast(kj) - static_cast(pad_left); + for (auto oj = 0u; + oj < strategy::output_cols() && + ij < static_cast(input_cols()) - pad_cols; + oj++, ij += this->m_args.stride_cols) + { + if (0 <= ij) // Sample if the point is in range. + { + point_input_samples[oj] = *(inptr_row + ii*ld_input_row + ij*ld_input_col); + } + } + } + + point_input_samples += strategy::output_cols(); + } + } + } + + tile_fn(inptrs, outptr_array, params, in_c, in_c*this->m_args.channel_multiplier); + + // Progress the output pointers + TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < strategy::output_rows() * strategy::output_cols(); i++) + { + outptr_pos[i] += this->m_args.channel_multiplier; + } + + // Progress the pointer into the parameters + params += param_stride; + } + } + } + } + } + + public: + DepthwiseDepthfirstGenericWithMultiplierBase(const DepthwiseArgs &args) : DepthwiseCommon(args) + { + } + + DepthwiseDepthfirstGenericWithMultiplierBase(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete; + DepthwiseDepthfirstGenericWithMultiplierBase &operator=(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete; + + size_t get_storage_size(void) const override + { + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl); + return kernel_points() * rounded_channels * sizeof(TWeight); + } + + size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + { + const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; + return n_threads * (sizeof_inptr_array() + + sizeof_input_samples() + + sizeof_outptr_array() + + sizeof_output_buffer(n_output_channels)); + } +}; + +template +class DepthwiseDepthfirstGenericWithMultiplier : public DepthwiseDepthfirstGenericWithMultiplierBase +{ + using TInput = typename strategy::input_type; + using TWeight = typename strategy::weight_type; + using TOutput = typename strategy::return_type; + using TAccum = typename strategy::bias_type; + + using Parent = DepthwiseDepthfirstGenericWithMultiplierBase; + + const TAccum *m_biases; // Pointer to bias vector + + 
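// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the sample-gathering loops
// above, reduced to one dimension. For every kernel tap we emit one input
// sample per output point, substituting the padding value wherever the tap
// falls outside the input; this is what lets the kernel itself ignore
// padding entirely. All names are hypothetical.
#include <cstddef>
#include <vector>

std::vector<float> gather_samples_sketch(const std::vector<float> &input,
                                         int kernel_size, int stride,
                                         int pad_left, float pad_value,
                                         int n_output_points)
{
  std::vector<float> samples;
  samples.reserve(static_cast<std::size_t>(kernel_size) * n_output_points);
  for (int k = 0; k < kernel_size; k++)          // for each kernel tap
  {
    for (int o = 0; o < n_output_points; o++)    // one sample per output point
    {
      const int idx = o * stride + k - pad_left; // input element this tap reads
      const bool in_range = (0 <= idx) && (idx < static_cast<int>(input.size()));
      samples.push_back(in_range ? input[idx] : pad_value);
    }
  }
  return samples;
}
// ---------------------------------------------------------------------------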
public: + DepthwiseDepthfirstGenericWithMultiplier(const DepthwiseArgs &args) + : Parent(args), m_biases(nullptr) + { + } + + DepthwiseDepthfirstGenericWithMultiplier(DepthwiseDepthfirstGenericWithMultiplier &) = delete; + DepthwiseDepthfirstGenericWithMultiplier &operator=(DepthwiseDepthfirstGenericWithMultiplier &) = delete; + + void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + { + m_biases = static_cast(biases); + Parent::pack_weights(static_cast(buffer), static_cast(weights), ld_weight_col, ld_weight_row); + } + + using DepthwiseDepthfirstGenericWithMultiplierBase::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + // Compute activation values + TAccum activation_min, activation_max; + if (std::numeric_limits::is_integer) + { + activation_min = std::numeric_limits::min(); + activation_max = std::numeric_limits::max(); + } + else + { + activation_min = static_cast(-std::numeric_limits::infinity()); + activation_max = static_cast(std::numeric_limits::infinity()); + } + + switch (this->m_args.activation.type) + { + case arm_gemm::Activation::Type::BoundedReLU: + activation_max = static_cast(this->m_args.activation.param1); + // Fall through + case arm_gemm::Activation::Type::ReLU: + activation_min = static_cast(0); + break; + default: + break; + } + + // Get a function to call for each point of the output + auto tile_fn = [&] (const TInput **inptrs, + TOutput **outptrs, + const TWeight *weights, + const unsigned int, + const unsigned int start_output_channel) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols)); +#endif + strat.kernel( + inptrs, outptrs, weights, + m_biases ? m_biases + start_output_channel : nullptr, + this->kernel_points(), this->m_args.channel_multiplier, + activation_min, activation_max + ); + }; + + Parent::execute_tiles( + tile_fn, 0.0f, + batches, input_height, input_width, input_channels, padding, + _input, ld_input_col, ld_input_row, ld_input_batch, + parameters, + output_height, output_width, + _output, ld_output_col, ld_output_row, ld_output_batch, + _working_space, thread_id, n_threads + ); + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp new file mode 100644 index 0000000000..d42382e208 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +#include "depthwise_depthfirst_generic_multiplier.hpp" + +namespace arm_conv { +namespace depthwise { + +template +class DepthwiseDepthfirstGenericWithMultiplierQuantized : public DepthwiseDepthfirstGenericWithMultiplierBase +{ + using TInput = typename strategy::input_type; + using TWeight = typename strategy::weight_type; + using TOutput = typename strategy::return_type; + using TAccum = typename strategy::bias_type; + + using Parent = DepthwiseDepthfirstGenericWithMultiplierBase; + + arm_gemm::Requantize32 m_qp; + + public: + DepthwiseDepthfirstGenericWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp) + : Parent(args), m_qp(qp) + { + } + + DepthwiseDepthfirstGenericWithMultiplierQuantized(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete; + DepthwiseDepthfirstGenericWithMultiplierQuantized &operator=(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete; + + void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + { + m_qp.bias = static_cast(biases); + Parent::pack_weights(static_cast(buffer), static_cast(weights), ld_weight_col, ld_weight_row); + } + + using Parent::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + // Get a function to call for each point of the output + auto tile_fn = [&] (const TInput **inptrs, + TOutput **outptrs, + const TWeight *weights, + const unsigned int, + const unsigned int start_output_channel) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned 
long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols)); +#endif + strat.kernel( + inptrs, outptrs, weights, + m_qp.bias == nullptr ? nullptr : m_qp.bias + start_output_channel, + this->kernel_points(), + this->m_args.channel_multiplier, + m_qp.per_channel_left_shifts == nullptr ? nullptr : m_qp.per_channel_left_shifts + start_output_channel, + m_qp.per_channel_muls == nullptr ? nullptr : m_qp.per_channel_muls + start_output_channel, + m_qp.per_channel_right_shifts == nullptr ? nullptr : m_qp.per_channel_right_shifts + start_output_channel, + m_qp + ); + }; + + Parent::execute_tiles( + tile_fn, m_qp.a_offset, + batches, input_height, input_width, input_channels, padding, + _input, ld_input_col, ld_input_row, ld_input_batch, + parameters, + output_height, output_width, + _output, ld_output_col, ld_output_row, ld_output_batch, + _working_space, thread_id, n_threads + ); + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp new file mode 100644 index 0000000000..cfb0d4bc05 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include "depthwise_depthfirst_generic.hpp" + +#include "arm_gemm.hpp" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +using arm_gemm::Requantize32; + +namespace arm_conv { +namespace depthwise { + +template +class DepthwiseDepthfirstGenericQuantized : public DepthwiseDepthfirstGenericBase +{ + using Parent = DepthwiseDepthfirstGenericBase; + using TInput = typename Parent::TInput; + using TAccum = typename Parent::TAccum; + using TOutput = typename Parent::TOutput; + + Requantize32 m_qp; + + public: + DepthwiseDepthfirstGenericQuantized(const DepthwiseArgs &args, const Requantize32 &qp) + : Parent(args), m_qp(qp) + { + } + + DepthwiseDepthfirstGenericQuantized(DepthwiseDepthfirstGenericQuantized &) = delete; + DepthwiseDepthfirstGenericQuantized &operator=(DepthwiseDepthfirstGenericQuantized &) = delete; + + void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + { + m_qp.bias = static_cast(biases); + Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row); + } + + using DepthwiseDepthfirstGenericBase::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + Strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + // Create a function to initialise the input buffer + const auto initialise_input_buffer = [this] (TInput *const buffer, const unsigned int n) { + std::memset(buffer, static_cast(m_qp.a_offset), n * sizeof(TInput)); + }; + + // Create a function to execute a tile of work + const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler( + PROFILE_KERNEL, + (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows* this->m_args.kernel_cols) + ); +#endif + strat.kernel(inptrs, outptrs, parameters, m_qp, + this->m_args.kernel_rows * this->m_args.kernel_cols, + this->m_args.input_channels); + }; + + // Call into a parent utility function to do the actual work. + Parent::execute_tiles( + tile_fn, initialise_input_buffer, + batches, input_height, input_width, input_channels, padding, + _input, ld_input_col, ld_input_row, ld_input_batch, + output_height, output_width, + _output, ld_output_col, ld_output_row, ld_output_batch, + _working_space, thread_id, n_threads + ); + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp new file mode 100644 index 0000000000..7c64e0be61 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +namespace arm_conv { +namespace depthwise { + +namespace common +{ + template + void depthwise_multiplier_execute( + const F execute_tile, + typename strategy::input_type pad_value, + const DepthwiseArgs &args, + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const size_t param_stride, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) + { + using TInput = typename strategy::input_type; + using TOutput = typename strategy::return_type; + + // Determine what portion of the work to do. + const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); + const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); + const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); + + // Cast input and output pointers into the right types + const TInput *const inptr = static_cast(_input); + TOutput *const outptr = static_cast(_output); + + // To simplify the kernel, we process padded or non-NCHW-ordered input into + // a form which can be consumed by the kernel. This data is stored here and + // passed into the kernel as an array of N pointers (one per row of the + // input). + TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*(16 / sizeof(TInput))]; + const TInput *inptrs[strategy::input_rows]; + + // Create an array for the output pointers + TOutput * _outptr_array[strategy::output_rows * strategy::output_cols]; + TOutput **const outptr_array = _outptr_array; + + // Allocate portions of the working space + uint8_t *const working_space = static_cast(_working_space); + TOutput *const output_buffer = reinterpret_cast(working_space); + + // For each output tile, construct the requisite set of pointers and call + // into the kernel. 
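// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the rearranged_input storage
// declared above, pictured with fixed toy dimensions. Each row of the input
// tile is backed by a local, padding-filled scratch row, and the kernel only
// ever sees an array of row pointers; when the source is unpadded NCHW those
// pointers can later be retargeted at the tensor itself.
#include <array>

struct RearrangedTileSketch
{
  static constexpr unsigned int kRows = 3, kCols = 8;  // toy tile shape

  float rows[kRows][kCols];                 // local, padding-filled storage
  std::array<const float *, kRows> inptrs;  // what the kernel consumes

  explicit RearrangedTileSketch(float pad_value)
  {
    for (unsigned int i = 0; i < kRows; i++)
    {
      for (unsigned int j = 0; j < kCols; j++)
      {
        rows[i][j] = pad_value;             // fill with padding first
      }
      inptrs[i] = rows[i];                  // kernel reads via row pointers
    }
  }
};
// ---------------------------------------------------------------------------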
+ for (unsigned int batch = 0; batch < batches; batch++) + { + // Get batch pointers + const auto inptr_batch = inptr + batch * ld_input_batch; + const auto outptr_batch = outptr + batch * ld_output_batch; + + for (int start_out_i = start_out_height; + start_out_i < end_out_height; + start_out_i += static_cast(strategy::output_rows)) + { + const int end_out_i = start_out_i + strategy::output_rows; + const int start_in_i = start_out_i * strategy::stride_rows - padding.top; + const int end_in_i = start_in_i + strategy::input_rows; + + // Compute top/bottom padding + const auto pad_top = static_cast(-std::min(start_in_i, 0)); + const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); + const unsigned int valid_output_rows = std::min( + end_out_i - start_out_i, + static_cast(output_height) - start_out_i + ); + + for (int start_out_j = 0; start_out_j < static_cast(output_width);) + { + const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left; + const int pad_left = -std::min(0, start_in_j); + + const int end_out_j = start_out_j + strategy::output_cols; + const int end_in_j = start_in_j + strategy::input_cols; + + const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); + const unsigned int valid_output_cols = std::min( + end_out_j - start_out_j, + static_cast(output_width) - start_out_j + ); + + // Construct the output pointer array. + TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < valid_output_rows; i++) + { + unsigned int j = 0u; + TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; + for (; j < valid_output_cols; j++) + { + *(outptr_pos++) = colptr; + colptr += ld_output_col; + } + for (; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + for (auto i = valid_output_rows; i < strategy::output_rows; i++) + { + for (auto j = 0u; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + + start_out_j += strategy::output_cols; + + const uint8_t *params = static_cast(parameters); + + // Loop over the input channels + for (unsigned int in_c = 0; in_c < input_channels; in_c++) + { + // Construct the input array - first fill with padding values and + // then fill in correct values. + for (unsigned int i = 0; i < strategy::input_rows; i++) + { + for (unsigned int j = 0; + j < (16 / sizeof(TInput)) * strategy::input_col_quads; j++) + { + rearranged_input[i][j] = pad_value; + } + inptrs[i] = rearranged_input[i]; + } + + auto inptr_row = inptr_batch + in_c + + (start_in_i + pad_top) * ld_input_row + + (start_in_j + pad_left) * ld_input_col; + if (ld_input_col == 1 && !pad_left && + start_in_j + (16 / sizeof(TInput)) * strategy::input_col_quads < input_width) + { + // The input tensor is already in NCHW format, and we're reading + // an unpadded section of it - allow the kernel to read it + // directly. + for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) + { + inptrs[i] = inptr_row; + inptr_row += ld_input_row; + } + } + else + { + // Either the input tensor isn't in NCHW format, or we're reading + // a padded section. Copy the relevant portion of the input here + // and allow the kernel to read this. 
+ for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) + { + auto inptr_col = inptr_row; + for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++) + { + rearranged_input[i][j] = *inptr_col; + inptr_col += ld_input_col; + } + inptr_row += ld_input_row; + } + } + + execute_tile(inptrs, outptr_array, params); + + // Progress the output pointers + TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++) + { + outptr_pos[i] += args.channel_multiplier; + } + + // Progress the pointer into the parameters + params += param_stride; + } + } + } + } + } +} + +template +class DepthwiseDepthfirstWithMultiplier : + public DepthwiseCommon +{ + using TInput = typename strategy::input_type; + using TWeight = typename strategy::weight_type; + using TOutput = typename strategy::return_type; + using TAccum = typename strategy::bias_type; + + size_t sizeof_output_buffer(unsigned int n_channels) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const auto rounded_channels = arm_gemm::roundup(n_channels, vl); + return sizeof(TOutput) * rounded_channels; + } + + public: + DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args) : DepthwiseCommon(args) + { + } + + DepthwiseDepthfirstWithMultiplier(DepthwiseDepthfirstWithMultiplier &) = delete; + DepthwiseDepthfirstWithMultiplier &operator=(DepthwiseDepthfirstWithMultiplier &) = delete; + + size_t get_storage_size(void) const override + { + // TODO What if we insert extra padding? Biases are a different size to the inputs, ... + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl); + return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight); + } + + void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override + { + // TODO What if the kernel needs a different packing function? + + // Cast the pointers + float *buffer = static_cast(_buffer); + const float *biases = static_cast(_biases); + const float *const weights = static_cast(_weights); + + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col; + ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row; + + for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++) + { + for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl) + { + const unsigned int out_c = in_c * this->m_args.channel_multiplier + n; + const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n); + + // Copy across the correct amount of bias (or 0) + for (unsigned int i = 0; i < todo; i++) + { + buffer[i] = (biases == nullptr) ? 
0 : biases[out_c + i]; + } + buffer += vl; + + // Copy each of the weights in turn + auto weights_row = weights + out_c; + for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) + { + auto weights_col = weights_row; + + for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) + { + for (unsigned int m = 0; m < todo; m++) + { + buffer[m] = weights_col[m]; + } + buffer += vl; + + weights_col += ld_weight_col; + } + + weights_row += ld_weight_row; + } + } + } + } + + size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + { + const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; + return n_threads * sizeof_output_buffer(n_output_channels); + } + + using DepthwiseCommon::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + // Compute activation values + TAccum activation_min = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() : std::numeric_limits::min(); + TAccum activation_max = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() : std::numeric_limits::max(); + + switch (this->m_args.activation.type) + { + case arm_gemm::Activation::Type::BoundedReLU: + activation_max = static_cast(this->m_args.activation.param1); + // Fall through + case arm_gemm::Activation::Type::ReLU: + activation_min = static_cast(0); + break; + default: + break; + } + + // Determine what portion of the work to do. + const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); + const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); + const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); + + // Need a stride over blocks of parameters + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const unsigned int param_stride = + arm_gemm::roundup(this->m_args.channel_multiplier, vl) * + (sizeof(TAccum) + sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols); + + // Cast input and output pointers into the right types + const TInput *const inptr = static_cast(_input); + TOutput *const outptr = static_cast(_output); + + // To simplify the kernel, we process padded or non-NCHW-ordered input into + // a form which can be consumed by the kernel. This data is stored here and + // passed into the kernel as an array of N pointers (one per row of the + // input). 
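[Editorial note: the activation handling above folds ReLU and bounded ReLU into a single [min, max] clamp, with a deliberate case fall-through setting the lower bound for both. A standalone restatement of that logic, assuming only the library's arm_gemm::Activation type (with its `type` and `param1` members, as used above):]

    #include <limits>

    // Derive clamp bounds from the requested activation. For float-like
    // accumulators the defaults are +/-infinity; for integral ones, the
    // numeric limits of the type.
    template <typename TAccum>
    void activation_bounds(const arm_gemm::Activation &act, TAccum &lo, TAccum &hi)
    {
        lo = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity()
                                                       : std::numeric_limits<TAccum>::min();
        hi = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity()
                                                       : std::numeric_limits<TAccum>::max();
        switch (act.type)
        {
            case arm_gemm::Activation::Type::BoundedReLU:
                hi = static_cast<TAccum>(act.param1);
                // Fall through: a bounded ReLU is also clamped below at zero.
            case arm_gemm::Activation::Type::ReLU:
                lo = static_cast<TAccum>(0);
                break;
            default:
                break;
        }
    }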
+ TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*4]; + const TInput *inptrs[strategy::input_rows]; + + // Create an array for the output pointers + TOutput * _outptr_array[strategy::output_rows * strategy::output_cols]; + TOutput **const outptr_array = _outptr_array; + + // Allocate portions of the working space + uint8_t *const working_space = static_cast(_working_space) + get_working_size(thread_id, input_channels); + TOutput *const output_buffer = reinterpret_cast(working_space); + + // For each output tile, construct the requisite set of pointers and call + // into the kernel. + for (unsigned int batch = 0; batch < batches; batch++) + { + // Get batch pointers + const auto inptr_batch = inptr + batch * ld_input_batch; + const auto outptr_batch = outptr + batch * ld_output_batch; + + for (int start_out_i = start_out_height; + start_out_i < end_out_height; + start_out_i += static_cast(strategy::output_rows)) + { + const int end_out_i = start_out_i + strategy::output_rows; + const int start_in_i = start_out_i * strategy::stride_rows - padding.top; + const int end_in_i = start_in_i + strategy::input_rows; + + // Compute top/bottom padding + const auto pad_top = static_cast(-std::min(start_in_i, 0)); + const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); + const unsigned int valid_output_rows = std::min( + end_out_i - start_out_i, + static_cast(output_height) - start_out_i + ); + + for (int start_out_j = 0; start_out_j < static_cast(output_width);) + { + const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left; + const int pad_left = -std::min(0, start_in_j); + + const int end_out_j = start_out_j + strategy::output_cols; + const int end_in_j = start_in_j + strategy::input_cols; + + const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); + const unsigned int valid_output_cols = std::min( + end_out_j - start_out_j, + static_cast(output_width) - start_out_j + ); + + // Construct the output pointer array. + TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < valid_output_rows; i++) + { + unsigned int j = 0u; + TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; + for (; j < valid_output_cols; j++) + { + *(outptr_pos++) = colptr; + colptr += ld_output_col; + } + for (; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + for (auto i = valid_output_rows; i < strategy::output_rows; i++) + { + for (auto j = 0u; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + + start_out_j += strategy::output_cols; + + const uint8_t *params = static_cast(parameters); + + // Loop over the input channels + for (unsigned int in_c = 0; in_c < input_channels; in_c++) + { + // Construct the input array - first fill with padding values and + // then fill in correct values. + for (unsigned int i = 0; i < strategy::input_rows; i++) + { + for (unsigned int j = 0; j < 4 * strategy::input_col_quads; j++) + { + rearranged_input[i][j] = static_cast(0); + } + inptrs[i] = rearranged_input[i]; + } + + auto inptr_row = inptr_batch + in_c + + (start_in_i + pad_top) * ld_input_row + + (start_in_j + pad_left) * ld_input_col; + if (ld_input_col == 1 && !pad_left && + start_in_j + 4 * strategy::input_col_quads < input_width) + { + // The input tensor is already in NCHW format, and we're reading + // an unpadded section of it - allow the kernel to read it + // directly. 
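[Editorial note: the zero-copy test above gates whether the kernel may read the input tensor in place or must consume a staged copy. The condition reduces to the following sketch; `can_read_in_place` is an illustrative name, and the strict `<` mirrors the conservative bound used in the source.]

    #include <cstddef>

    // A tile row can be read directly from the tensor only if columns are
    // contiguous, there is no left padding, and the whole tile row lies
    // strictly inside the valid input region.
    bool can_read_in_place(size_t ld_input_col, unsigned int pad_left,
                           int start_in_j, unsigned int tile_input_cols,
                           unsigned int input_width)
    {
        return ld_input_col == 1 && pad_left == 0 &&
               start_in_j + static_cast<int>(tile_input_cols) < static_cast<int>(input_width);
    }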
+ for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) + { + inptrs[i] = inptr_row; + inptr_row += ld_input_row; + } + } + else + { + // Either the input tensor isn't in NCHW format, or we're reading + // a padded section. Copy the relevant portion of the input here + // and allow the kernel to read this. + for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) + { + auto inptr_col = inptr_row; + for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++) + { + rearranged_input[i][j] = *inptr_col; + inptr_col += ld_input_col; + } + inptr_row += ld_input_row; + } + } + + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols)); +#endif + strat.kernel( + inptrs, outptr_array, params, + this->m_args.channel_multiplier, + activation_min, activation_max + ); + } + + // Progress the output pointers + TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++) + { + outptr_pos[i] += this->m_args.channel_multiplier; + } + + // Progress the pointer into the parameters + params += param_stride; + } + } + } + } + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp new file mode 100644 index 0000000000..07ce0d3b55 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include "depthwise_depthfirst_multiplier.hpp" + +namespace arm_conv { +namespace depthwise { + +template +class DepthwiseDepthfirstWithMultiplierQuantized : + public DepthwiseCommon +{ + using Parent = DepthwiseCommon; + using TInput = typename strategy::input_type; + using TWeight = typename strategy::weight_type; + using TOutput = typename strategy::return_type; + + const arm_gemm::Requantize32 m_qp; + + size_t sizeof_output_buffer(unsigned int n_channels) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const auto rounded_channels = arm_gemm::roundup(n_channels, vl); + return sizeof(typename strategy::return_type) * rounded_channels; + } + + public: + DepthwiseDepthfirstWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp) + : Parent(args), m_qp(qp) + { + } + + DepthwiseDepthfirstWithMultiplierQuantized(DepthwiseDepthfirstWithMultiplierQuantized &) = delete; + DepthwiseDepthfirstWithMultiplierQuantized &operator=(DepthwiseDepthfirstWithMultiplierQuantized &) = delete; + + size_t get_storage_size(void) const override + { + // We produce VL channels at a time, for each of these blocks of + // channels we store a vector of biases, weights (complicated) and + // requantize parameters. + const unsigned int iter_length = + arm_gemm::utils::get_vector_length(strategy::vl_type); + const unsigned int n_iters = + this->m_args.input_channels * arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length); + + // Compute the cost of storing the weights + const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u); + + return n_iters * iter_length * ( + sizeof(int32_t) + // Bias + 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(TWeight) + // Weights + 2 * sizeof(int32_t) // Requantisation parameters + ); + } + + // We'll want an optimised version of this, but for now a C++ implementation + // is probably sufficient. + void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override + { + auto buffer = static_cast(_buffer); + auto biases = static_cast(_biases); + auto weights = static_cast(_weights); + auto requant_muls = m_qp.per_channel_muls; + auto requant_shifts = m_qp.per_channel_right_shifts; + + const unsigned int iter_length = + arm_gemm::utils::get_vector_length(strategy::vl_type); + const unsigned int n_iters_per_input_channel = + arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length); + + const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u); + + const size_t iter_stride = iter_length * ( + sizeof(int32_t) + // Bias + 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights + 2 * sizeof(int32_t) // Requantisation parameters + ); + + ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels * this->m_args.channel_multiplier : ld_weight_col; + ld_weight_row = (ld_weight_row == 0) ? 
this->m_args.kernel_cols * ld_weight_col : ld_weight_row; + + for (unsigned int input_channel = 0; input_channel < this->m_args.input_channels; input_channel++) + { + auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride; + auto weights_input_channel = weights + input_channel * this->m_args.channel_multiplier; + + for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++) + { + // Get a pointer to the start of this portion of the buffer; consequently + // derive pointers to the bias, weight and requantisation portions of + // this frame. + auto buffer_base = buffer_input_channel + iter_stride * iter; + auto buffer_biases = reinterpret_cast(buffer_base); + auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length; + auto buffer_requant_mul = reinterpret_cast( + buffer_weights + strategy::kernel_rows * n_dots_per_kernel_row * 4 * iter_length); + auto buffer_requant_shift = buffer_requant_mul + iter_length; + auto weights_base = weights_input_channel + iter * iter_length; + + // Hence work through the data for this iteration, on a + // channel-by-channel basis. + const auto this_iter_length = std::min( + iter_length, this->m_args.channel_multiplier - iter * iter_length + ); + for (unsigned int i = 0; i < this_iter_length; i++) + { + auto weights_channel = weights_base + i; + + // Read the bias value, we modify this as we read the weights. + auto bias_value = biases == nullptr ? 0 : *(biases++); + int32_t elements_sum = 0; + + // Read through the kernel; for each row, marshal together as many dot + // product terms as are required. + for (unsigned int ki = 0; ki < strategy::kernel_rows; ki++) + { + auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length; + auto weights_row = weights_channel + ki * ld_weight_row; + + unsigned int kj = 0; + for (; kj < strategy::kernel_cols; kj++) + { + // Determine which element to which we're writing + const auto dot = kj / 4; + const auto elem = kj % 4; + + // Copy the value; include in the sum + const auto val = weights_row[kj * ld_weight_col]; + buffer_row[dot * 4 * iter_length + elem] = val; + elements_sum += val; + } + for (; kj < 4 * n_dots_per_kernel_row; kj++) + { + const auto dot = kj / 4; + const auto elem = kj % 4; + buffer_row[dot * 4 * iter_length + elem] = 0; + } + + buffer_row += 4 * n_dots_per_kernel_row * iter_length; + } + + // Write back the bias and offset values + *(buffer_biases++) = + bias_value - m_qp.a_offset * elements_sum + + strategy::kernel_rows * strategy::kernel_cols * m_qp.a_offset * m_qp.b_offset; + + // Write out the requantisation parameters + *(buffer_requant_mul++) = m_qp.per_channel_requant ? *(requant_muls++) : m_qp.per_layer_mul; + *(buffer_requant_shift++) = m_qp.per_channel_requant ? 
*(requant_shifts++) : m_qp.per_layer_right_shift; + } + } + } + } + + size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + { + const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; + return n_threads * sizeof_output_buffer(n_output_channels); + } + + using Parent::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *const _working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + + auto executefn = [strat, this] ( + const TInput *const *const inptrs, + TOutput *const *const outptr_array, + const void *const params + ) { + strat.kernel(inptrs, outptr_array, params, this->m_args.channel_multiplier, m_qp); + }; + + // Get working space for this thread + uint8_t *const working_space = static_cast(_working_space) + get_working_size(1, input_channels) * thread_id; + + // Determine the stride across blocks of parameters + const unsigned int iter_length = + arm_gemm::utils::get_vector_length(strategy::vl_type); + const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length); + const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u); + const size_t param_stride = n_iters_per_input_channel * iter_length * ( + sizeof(int32_t) + // Bias + 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights + 2 * sizeof(int32_t) // Requantisation parameters + ); + + common::depthwise_multiplier_execute( + executefn, m_qp.a_offset, this->m_args, + batches, input_height, input_width, input_channels, padding, + _input, ld_input_col, ld_input_row, ld_input_batch, + parameters, param_stride, + output_height, output_width, + _output, ld_output_col, ld_output_row, ld_output_batch, + working_space, thread_id, n_threads + ); + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp new file mode 100644 index 0000000000..f97569e958 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +namespace arm_conv { +namespace depthwise { + +namespace +{ + +// We have two sets of quantized kernels; those which use the dot-product +// instructions and which require the biases and quantisation parameters to be +// ravelled into weights/parameter array, and those which use the MLAL +// instructions and which consume separate bias and quantisation parameter +// arrays. The following code adapts these two sets of kernels to use the same +// API - allowing the same driver loop to call them both. + +template +using UnravelledKernFn = std::function; + +template +using RavelledKernFn = std::function; + +template +const UnravelledKernFn get_unified_kernel(const UnravelledKernFn &f) { return f; } + +template +const UnravelledKernFn get_unified_kernel(const RavelledKernFn &f) +{ + return [f] (const unsigned int n_channels, + const TIn *const *const inptrs, + const TWeight *const weights, + const int32_t *, // Bias (ravelled) + const arm_gemm::Requantize32 &qp, + const int32_t *, // Requantisation muls (ravelled) + const int32_t *, // Requantisation shifts (ravelled) + TOut *const *const outptrs) { + return f(inptrs, outptrs, weights, n_channels, qp); + }; +} + +template +using UnravelledPackingFn = std::function; + +template +using RavelledPackingFn = std::function; + +template +const RavelledPackingFn get_unified_packer(const UnravelledPackingFn &f) +{ + return [f] (const unsigned int n_channels, + void *buffer, + const int32_t *, // Bias + const T *weights, + const arm_gemm::Requantize32 &, + size_t ld_weight_col, + size_t ld_weight_row) + { + return f(n_channels, buffer, weights, ld_weight_col, ld_weight_row); + }; +} + +template +const RavelledPackingFn get_unified_packer(const RavelledPackingFn &f) { return f; } + +template +constexpr bool requires_unravelled_bias_and_quant_params(const UnravelledPackingFn &) { return true; } + +template +constexpr bool requires_unravelled_bias_and_quant_params(const RavelledPackingFn &) { return false; } + +template +constexpr bool strategy_requires_unravelled_bias_and_quant_params(void) +{ + return requires_unravelled_bias_and_quant_params(strategy::pack_parameters); +} + +} + +template +class DepthwiseDepthfirstQuantized : + public DepthwiseCommon +{ + using TInput = typename strategy::input_type; + using TWeight = typename strategy::weight_type; + using TOutput = typename strategy::return_type; + using TAccum = typename strategy::bias_type; + + arm_gemm::Requantize32 m_qp; + + size_t sizeof_input_buffer(unsigned int n_channels) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const auto rounded_channels = arm_gemm::roundup(n_channels, vl); + return sizeof(TInput) * rounded_channels; + } + + size_t sizeof_output_buffer(unsigned int n_channels) const + { + const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); + const auto rounded_channels = 
arm_gemm::roundup(n_channels, vl); + return sizeof(TOutput) * rounded_channels; + } + + size_t sizeof_bias_buffer(unsigned int n_channels) const + { + if (strategy_requires_unravelled_bias_and_quant_params()) + { + return (m_qp.bias == nullptr) ? sizeof(TAccum) * n_channels : 0; + } + + return 0; + } + + size_t sizeof_requant_mul_buffer(unsigned int n_channels) const + { + if (strategy_requires_unravelled_bias_and_quant_params()) + { + return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels; + } + + return 0; + } + + size_t sizeof_requant_shift_buffer(unsigned int n_channels) const + { + if (strategy_requires_unravelled_bias_and_quant_params()) + { + return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels; + } + + return 0; + } + + public: + DepthwiseDepthfirstQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp) + : DepthwiseCommon(args), m_qp(qp) + { + } + + DepthwiseDepthfirstQuantized(DepthwiseDepthfirstQuantized &) = delete; + DepthwiseDepthfirstQuantized &operator=(DepthwiseDepthfirstQuantized &) = delete; + + size_t get_storage_size(void) const override + { + return strategy::get_packed_size(this->m_args); + } + + void pack_parameters(void *buffer, const void *const bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + { + if (strategy_requires_unravelled_bias_and_quant_params()) + { + m_qp.bias = static_cast(bias); + } + + get_unified_packer(strategy::pack_parameters)( + this->m_args.input_channels, + buffer, + static_cast(bias), + reinterpret_cast(weights), + m_qp, + ld_weight_col, + ld_weight_row + ); + } + + size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + { + const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; + return n_threads * ( + sizeof_output_buffer(n_output_channels) + + sizeof_input_buffer(n_channels) + + sizeof_bias_buffer(n_channels) + + sizeof_requant_mul_buffer(n_channels) + + sizeof_requant_shift_buffer(n_channels) + ); + } + + using DepthwiseCommon::execute; + void execute( + const unsigned int batches, + const unsigned int input_height, + const unsigned int input_width, + const unsigned int input_channels, + const PaddingValues &padding, + const void *const _input, + const size_t ld_input_col, + const size_t ld_input_row, + const size_t ld_input_batch, + const void *const parameters, + const unsigned int output_height, + const unsigned int output_width, + void *const _output, + const size_t ld_output_col, + const size_t ld_output_row, + const size_t ld_output_batch, + void *_working_space, + const unsigned int thread_id, + const unsigned int n_threads + ) const override + { + strategy strat(this->m_args.cpu_info); +#ifdef CYCLE_PROFILING + arm_gemm::profiler prof; +#endif + // Get a unified API for the kernel function + auto kernel = get_unified_kernel(strat.kernel); + + // Determine what portion of the work to do. 
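[Editorial note: for the quantised case, the per-thread working space is the sum of the buffers sized by the sizeof_* helpers above. The sketch below restates that budget in one place; it is illustrative only, simplifies the ravelled/unravelled distinction to two booleans, and uses hypothetical names throughout.]

    #include <cstddef>
    #include <cstdint>

    static size_t roundup_sz(size_t n, size_t v) { return v * ((n + v - 1) / v); }

    // Approximate per-thread working-space budget for the quantised driver:
    // an output dump buffer and a zero-point input buffer (both rounded up to
    // the vector length), plus synthesised bias/requant vectors when the
    // caller did not provide them.
    size_t working_space_per_thread(size_t vl, size_t in_channels, size_t multiplier,
                                    size_t sizeof_in, size_t sizeof_out,
                                    bool bias_provided, bool per_channel_requant)
    {
        const size_t out_channels = in_channels * multiplier;
        size_t bytes = sizeof_out * roundup_sz(out_channels, vl)  // output dump buffer
                     + sizeof_in  * roundup_sz(in_channels, vl);  // zero-point input buffer
        if (!bias_provided)       bytes += sizeof(int32_t) * in_channels;      // zeroed bias
        if (!per_channel_requant) bytes += 2 * sizeof(int32_t) * in_channels;  // muls + shifts
        return bytes;
    }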
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); + const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); + const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); + + // Cast input and output pointers into the right types + const TInput *const inptr = static_cast(_input); + TOutput *const outptr = static_cast(_output); + + // Create an array for the input pointers + const TInput * _inptr_array[strategy::input_rows * strategy::input_cols]; + const TInput **const inptr_array = _inptr_array; + + // Create an array for the output pointers + TOutput * _outptr_array[strategy::output_rows * strategy::output_cols]; + TOutput **const outptr_array = _outptr_array; + + // Allocate portions of the working space + uint8_t *working_space = static_cast(_working_space) + get_working_size(thread_id, input_channels); + + TOutput *const output_buffer = reinterpret_cast(working_space); + working_space += sizeof_output_buffer(input_channels * this->m_args.channel_multiplier); + + TInput *const input_buffer = reinterpret_cast(working_space); + working_space += sizeof_input_buffer(input_channels); + + const int32_t *const bias_ptr = (m_qp.bias == nullptr) ? reinterpret_cast(working_space) + : m_qp.bias; + working_space += sizeof_bias_buffer(input_channels * this->m_args.channel_multiplier); + + const int32_t *const requant_mul_vec = !m_qp.per_channel_requant ? reinterpret_cast(working_space) + : m_qp.per_channel_muls; + working_space += sizeof_requant_mul_buffer(input_channels * this->m_args.channel_multiplier); + + const int32_t *const requant_shift_vec = !m_qp.per_channel_requant ? reinterpret_cast(working_space) + : m_qp.per_channel_right_shifts; + + if (strategy_requires_unravelled_bias_and_quant_params()) + { + // Initialise the bias buffer + if (m_qp.bias == nullptr) + { + for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++) + { + const_cast(bias_ptr)[c] = 0; + } + } + + // Initialise the requantisation parameters + if (!m_qp.per_channel_requant) + { + for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++) + { + const_cast(requant_mul_vec)[c] = m_qp.per_layer_mul; + const_cast(requant_shift_vec)[c] = m_qp.per_layer_right_shift; + } + } + } + + // Initialise the input buffer + for (unsigned int c = 0; c < input_channels; c++) + { + input_buffer[c] = static_cast(m_qp.a_offset); + } + + // For each output tile, construct the requisite set of pointers and call + // into the kernel. 
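[Editorial note: the initialisation above is what lets the kernels ignore padding entirely: one shared buffer holds the quantised zero point, and every input-tile pointer starts out aimed at it, so out-of-image reads simply return "zero". A minimal sketch with illustrative names, using int8_t for concreteness:]

    #include <cstddef>
    #include <cstdint>

    // Fill the shared pad buffer with the input zero point (m_qp.a_offset in
    // the source above).
    void init_pad_buffer(int8_t *pad_buffer, size_t channels, int32_t a_offset)
    {
        for (size_t c = 0; c < channels; c++)
            pad_buffer[c] = static_cast<int8_t>(a_offset);
    }

    // Default every tile pointer to the pad buffer; only in-image positions
    // are later overwritten with real addresses.
    void reset_input_ptrs(const int8_t **ptrs, size_t n_ptrs, const int8_t *pad_buffer)
    {
        for (size_t i = 0; i < n_ptrs; i++)
            ptrs[i] = pad_buffer;
    }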
+ for (unsigned int batch = 0; batch < batches; batch++) + { + // Get batch pointers + const auto inptr_batch = inptr + batch * ld_input_batch; + const auto outptr_batch = outptr + batch * ld_output_batch; + + for (int start_out_i = start_out_height; + start_out_i < end_out_height; + start_out_i += static_cast(strategy::output_rows)) + { + const int end_out_i = start_out_i + strategy::output_rows; + const int start_in_i = start_out_i * strategy::stride_rows - padding.top; + const int end_in_i = start_in_i + strategy::input_rows; + + // Compute top/bottom padding + const auto pad_top = static_cast(-std::min(start_in_i, 0)); + const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); + const unsigned int valid_output_rows = std::min( + end_out_i - start_out_i, + static_cast(output_height) - start_out_i + ); + + // Fill the input pointer array with padding values + for (auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++) + { + inptr_array[index] = input_buffer; + } + + for (int start_out_j = 0; start_out_j < static_cast(output_width);) + { + const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left; + const int pad_left = -std::min(0, start_in_j); + + const int end_out_j = start_out_j + strategy::output_cols; + const int end_in_j = start_in_j + strategy::input_cols; + + const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); + const unsigned int valid_output_cols = std::min( + end_out_j - start_out_j, + static_cast(output_width) - start_out_j + ); + + // Construct the input pointer array - fill the array with pointers to + // the input buffer and then fill in the required values. + for (auto i = pad_top; i < strategy::input_rows - pad_bottom; i++) + { + // Can skip over the left padding because we will have either the + // same or less than the previous tile. + unsigned int j = pad_left; + const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col; + const TInput **ptrs = inptr_array + i * strategy::input_cols + j; + for (; j < strategy::input_cols - pad_right; j++) + { + *(ptrs++) = colptr; + colptr += ld_input_col; + } + for (; j < strategy::input_cols; j++) + { + *(ptrs++) = input_buffer; + } + } + + // Construct the output pointer array. 
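[Editorial note: the loop above overwrites only the valid region of the input-pointer grid, leaving padded entries pointing at the zero-point buffer. Restated as a standalone sketch; `fill_valid_input_ptrs` is an illustrative name, and `base` is assumed to point at the tile's origin in input coordinates.]

    #include <cstddef>

    // Fill entries (i, j) with i in [pad_top, rows - pad_bottom) and
    // j in [pad_left, cols - pad_right); all other entries keep their
    // default (pad-buffer) value set beforehand.
    template <typename T>
    void fill_valid_input_ptrs(const T **ptrs, unsigned int tile_rows, unsigned int tile_cols,
                               unsigned int pad_top, unsigned int pad_bottom,
                               unsigned int pad_left, unsigned int pad_right,
                               const T *base, size_t ld_row, size_t ld_col)
    {
        for (unsigned int i = pad_top; i < tile_rows - pad_bottom; i++)
        {
            const T *colptr = base + i * ld_row + pad_left * ld_col;
            for (unsigned int j = pad_left; j < tile_cols - pad_right; j++)
            {
                ptrs[i * tile_cols + j] = colptr;
                colptr += ld_col;
            }
        }
    }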
+ TOutput **outptr_pos = outptr_array; + for (auto i = 0u; i < valid_output_rows; i++) + { + unsigned int j = 0u; + TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; + for (; j < valid_output_cols; j++) + { + *(outptr_pos++) = colptr; + colptr += ld_output_col; + } + for (; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + for (auto i = valid_output_rows; i < strategy::output_rows; i++) + { + for (auto j = 0u; j < strategy::output_cols; j++) + { + *(outptr_pos++) = output_buffer; + } + } + + start_out_j += strategy::output_cols; + +#ifdef CYCLE_PROFILING + // TODO Work number + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.kernel_rows * this->m_args.kernel_cols)); +#endif + kernel( + this->m_args.input_channels, + inptr_array, + reinterpret_cast(parameters), + bias_ptr, m_qp, requant_mul_vec, requant_shift_vec, + outptr_array + ); + } + } + } + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp new file mode 100644 index 0000000000..fdb36fc1d1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_gemm_local.hpp" + +#include "depthwise_implementation.hpp" +#include "depthwise_depthfirst.hpp" +#include "depthwise_depthfirst_generic.hpp" +#include "depthwise_depthfirst_multiplier.hpp" +#include "depthwise_depthfirst_generic_multiplier.hpp" + +#include "depthwise_implementation_constraints.hpp" + +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) +#include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp" +#include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp" +#include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#endif // defined(__ARM_FEATURE_SVE) +#include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp" +#include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp" +#include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp" +#include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" +#endif // defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +namespace +{ + template + unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &) + { + // First-pass: compute the number of output pixels which will be computed. + return arm_gemm::roundup(args.output_rows, Strategy::output_rows) * + arm_gemm::roundup(args.output_cols, Strategy::output_cols) * + arm_gemm::iceildiv( + (long unsigned) args.input_channels * args.channel_multiplier, + arm_gemm::utils::get_vector_length(Strategy::vl_type) + ); + } + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) + { + return std::numeric_limits::max(); + } + + unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &) + { + return args.channel_multiplier > 1 ? 
0 : std::numeric_limits::max(); + } +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +} + +#if defined(__ARM_FP16_ARGS) + +static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = { +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, +#endif // defined(__ARM_FEATURE_SVE) +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp16_nhwc_generic_output3x3_mla_depthfirst", + 
constraint(has_no_channel_multiplier), + not_preferred, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirstGeneric(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", + nullptr, + not_preferred_if_no_multiplier, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { + return new DepthwiseDepthfirstGenericWithMultiplier(args); + }, + }, +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(__aarch64__) + { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list +}; + +template <> +const DepthwiseImplementation<__fp16> *depthwise_implementation_list() +{ + return depthwise_fp16_methods; +} + +template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &); +template std::vector get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &); + +#endif // defined(__ARM_FP16_ARGS) + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp new file mode 100644 index 0000000000..aea750a475 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_gemm_local.hpp" + +#include "depthwise_implementation.hpp" +#include "depthwise_depthfirst.hpp" +#include "depthwise_depthfirst_generic.hpp" +#include "depthwise_depthfirst_multiplier.hpp" +#include "depthwise_depthfirst_generic_multiplier.hpp" + +#include "depthwise_implementation_constraints.hpp" + +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) +#include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp" +#include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp" +#include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp" +#include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp" +#include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp" +#include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" +#endif // defined(__ARM_FEATURE_SVE) +#include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp" +#include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp" +#include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp" +#include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp" +#include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp" +#include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" +#endif // defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +namespace +{ + template + unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &) + { + // First-pass: compute the number of output pixels which will be computed. + return arm_gemm::roundup(args.output_rows, Strategy::output_rows) * + arm_gemm::roundup(args.output_cols, Strategy::output_cols) * + arm_gemm::iceildiv( + (long unsigned) args.input_channels * args.channel_multiplier, + arm_gemm::utils::get_vector_length(Strategy::vl_type) + ); + } + + unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) + { + return std::numeric_limits::max(); + } + + unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &) + { + return args.channel_multiplier > 1 ? 
0 : std::numeric_limits::max(); + } +} + +static const DepthwiseImplementation depthwise_fp32_methods[] = { +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_generic_output3x3_mla_depthfirst", + constraint(has_no_channel_multiplier), + not_preferred, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGeneric(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst", + constraint(is_supported), + not_preferred_if_no_multiplier, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplier(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst", + constraint(is_supported), + not_preferred_if_no_multiplier, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplier(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", + nullptr, + not_preferred_if_no_multiplier, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericWithMultiplier(args); + }, + }, +#endif // defined(__ARM_FEATURE_SVE) + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, 
const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier), + cycle_estimate, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirst(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_generic_output3x3_mla_depthfirst", + constraint(has_no_channel_multiplier), + not_preferred, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGeneric(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst", + constraint(is_supported), + not_preferred_if_no_multiplier, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplier(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst", + constraint(is_supported), + not_preferred_if_no_multiplier, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplier(args); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", + nullptr, + not_preferred_if_no_multiplier, + [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericWithMultiplier(args); + }, + }, +#endif // defined(__aarch64__) + { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list +}; + +template <> +const DepthwiseImplementation *depthwise_implementation_list() +{ + return depthwise_fp32_methods; +} + +template UniqueDepthwiseCommon depthwise(const DepthwiseArgs &, const Nothing &); +template std::vector get_compatible_kernels(const DepthwiseArgs &, const Nothing &); + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp new file mode 100644 index 0000000000..1d52b56d36 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+
+#include <cstring>
+#include <functional>
+
+using arm_gemm::Nothing;
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+struct DepthwiseImplementation
+{
+  const DepthwiseMethod method;
+  const char *name;
+  std::function<bool(const DepthwiseArgs &, const OutputStage &)> is_supported;
+  std::function<uint64_t(const DepthwiseArgs &, const OutputStage &)> cycle_estimate;
+  std::function<DepthwiseCommon<TInput, TWeight, TOutput> *(const DepthwiseArgs &, const OutputStage &)> initialise;
+
+  bool get_is_supported(const DepthwiseArgs &args, const OutputStage &os) const
+  {
+    return (is_supported == nullptr) ? true : is_supported(args, os);
+  }
+
+  uint64_t get_cycle_estimate(const DepthwiseArgs &args, const OutputStage &os) const
+  {
+    return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
+  }
+
+  DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
+  {
+    return initialise(args, os);
+  }
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *depthwise_implementation_list();
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+bool find_implementation(
+  const DepthwiseArgs &args,
+  const OutputStage &os,
+  const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> * &selected
+)
+{
+  selected = nullptr;
+  uint64_t best_cycle_estimate = UINT64_MAX;
+
+  const auto *impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+  for (; impl->method != DepthwiseMethod::DEFAULT; impl++)
+  {
+    const bool has_cfg = (args.config != nullptr);
+    const auto &cfg = args.config;
+
+    if (
+      !impl->get_is_supported(args, os) ||  // Problem is unsupported
+      (has_cfg && cfg->method != DepthwiseMethod::DEFAULT && cfg->method != impl->method) ||
+      (has_cfg && cfg->filter != "" && !std::strstr(impl->name, cfg->filter.c_str()))
+    )
+    {
+      continue;
+    }
+
+    const auto cycle_estimate = impl->get_cycle_estimate(args, os);
+
+    if (cycle_estimate == 0)
+    {
+      selected = impl;
+      break;
+    }
+
+    if (selected == nullptr || cycle_estimate < best_cycle_estimate)
+    {
+      selected = impl;
+      best_cycle_estimate = cycle_estimate;
+    }
+  }
+
+  return (selected != nullptr);
+}
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &args, const OutputStage &os)
+{
+  std::vector<KernelDescription> kerns;
+
+  // Find the default implementation so we can flag it accordingly
+  const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *default_impl;
+  find_implementation(args, os, default_impl);
+
+  for (auto impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+       impl->method != DepthwiseMethod::DEFAULT; impl++)
+  {
+    if (!impl->get_is_supported(args, os))
+    {
+      continue;
+    }
+
+    kerns.emplace_back(
+      impl->method, impl->name, impl == default_impl,
+      impl->get_cycle_estimate(args, os)
+    );
+  }
+
+  return kerns;
+}
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &args, const OutputStage &os)
+{
+  const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
+  const bool success = find_implementation(args, os, impl);
+  return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
new file mode 100644
index 0000000000..b4814bef92
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Utilities for constructing functions which constrain which kernels are + * selected for a given depthwise problem. + * + * It is expected that this will be included in the files which list the + * available kernels. To avoid multiple definitions, an anonymous namespace is + * used. + */ + +#pragma once + +#include "arm_gemm.hpp" +#include "depthwise.hpp" + +namespace arm_conv +{ +namespace depthwise +{ +namespace +{ + +template +using ConstraintFn = std::function; + +using GenericConstraintFn = std::function; + +GenericConstraintFn make_constraint(const GenericConstraintFn &f) __attribute__ ((unused)); +GenericConstraintFn make_constraint(const GenericConstraintFn &f) +{ + return f; +} + +template +GenericConstraintFn make_constraint(const GenericConstraintFn &f, Fs ... fs) +{ + return [f, fs...] (const DepthwiseArgs &args, const void *os) -> bool { + return f(args, os) && make_constraint(fs...)(args, os); + }; +} + +template +ConstraintFn constraint(Fs ... fs) +{ + return [fs...] (const DepthwiseArgs &args, const OutputStage &os) -> bool { + return make_constraint(fs...)(args, &os); + }; +} + +// Some useful constraints +template +bool is_supported(const DepthwiseArgs &args, const void *) +{ + return ((args.kernel_rows == Strategy::kernel_rows) && + (args.kernel_cols == Strategy::kernel_cols) && + (args.stride_rows == Strategy::stride_rows) && + (args.stride_cols == Strategy::stride_cols)); +} + +bool cpu_has_dot_product(const DepthwiseArgs &args, const void *) __attribute__ ((unused)); +bool cpu_has_dot_product(const DepthwiseArgs &args, const void *) +{ + return args.cpu_info->has_dotprod(); +} + +bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused)); +bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *) +{ + return args.channel_multiplier == 1; +} + +bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused)); +bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp) +{ + const auto qp = static_cast(_qp); + return qp->per_channel_requant ? 
+ (qp->per_channel_left_shifts == nullptr) : + (qp->per_layer_left_shift == 0); +} + +} // namespace +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp new file mode 100644 index 0000000000..40370fe59e --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm_local.hpp" + +#include "depthwise_implementation.hpp" +#include "depthwise_depthfirst_quantized.hpp" +#include "depthwise_depthfirst_generic_quantized.hpp" +#include "depthwise_depthfirst_multiplier_quantized.hpp" +#include "depthwise_depthfirst_generic_multiplier_quantized.hpp" + +#include "depthwise_implementation_constraints.hpp" + +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(SVE2) +#include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" +#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" +#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp" +#include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp" +#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2) +#include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" +#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" +#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp" +#include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp" +#include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp" +#include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" +#endif // defined(__aarch64__) + +#include + +using arm_gemm::Requantize32; + +namespace arm_conv { +namespace depthwise { + +namespace +{ + +bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp) +{ + 
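+  // "Symmetric" here means the weights' zero point (b_offset) is zero, so
+  // the requantisation correction term contributed by the weight offset
+  // vanishes; the s8qs dot-product kernels below depend on this.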
const auto qp = static_cast(_qp); + return qp->b_offset == 0; +} + +} + +static const DepthwiseImplementation depthwise_s8q_methods[] = { +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(SVE2) + { + DepthwiseMethod::DEPTHFIRST, + "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift, + qp_weights_are_symmetric), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", + constraint(is_supported, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", + constraint(is_supported, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, +#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2) + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_weights_are_symmetric, + qp_has_no_left_shift, + cpu_has_dot_product), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift, + cpu_has_dot_product), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new 
DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_nhwc_generic_output3x3_mla_depthfirst", + constraint(has_no_channel_multiplier), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", + constraint(is_supported, + qp_has_no_left_shift, + cpu_has_dot_product), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", + constraint(is_supported, + qp_has_no_left_shift, + cpu_has_dot_product), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", + nullptr, + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericWithMultiplierQuantized(args, qp); + }, + }, +#endif // defined(__aarch64__) + { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list +}; + +template <> +const DepthwiseImplementation *depthwise_implementation_list() +{ + return depthwise_s8q_methods; +} + +template UniqueDepthwiseCommon depthwise(const DepthwiseArgs &, const Requantize32 &); +template std::vector get_compatible_kernels(const DepthwiseArgs &, const Requantize32 &); + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp new file mode 100644 index 0000000000..3e190d242a --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm_local.hpp" + +#include "depthwise_implementation.hpp" +#include "depthwise_depthfirst_quantized.hpp" +#include "depthwise_depthfirst_generic_quantized.hpp" +#include "depthwise_depthfirst_multiplier_quantized.hpp" +#include "depthwise_depthfirst_generic_multiplier_quantized.hpp" + +#include "depthwise_implementation_constraints.hpp" + +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(SVE2) +#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" +#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp" +#include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp" +#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2) +#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" +#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp" +#include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp" +#include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp" +#include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" +#endif // defined(__aarch64__) + +#include + +using arm_gemm::Requantize32; + +namespace arm_conv { +namespace depthwise { + +static const DepthwiseImplementation depthwise_u8q_methods[] = { +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(SVE2) + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> 
DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", + constraint(is_supported, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", + constraint(is_supported, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, +#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2) + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst", + constraint(is_supported, + cpu_has_dot_product, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_nhwc_generic_output3x3_mla_depthfirst", + constraint(has_no_channel_multiplier), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", + constraint(is_supported, + cpu_has_dot_product, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", + constraint(is_supported, + cpu_has_dot_product, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", + nullptr, + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericWithMultiplierQuantized(args, qp); + }, + }, +#endif // defined(__aarch64__) + { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list +}; + +template <> +const 
DepthwiseImplementation *depthwise_implementation_list() +{ + return depthwise_u8q_methods; +} + +template UniqueDepthwiseCommon depthwise(const DepthwiseArgs &, const Requantize32 &); +template std::vector get_compatible_kernels(const DepthwiseArgs &, const Requantize32 &); + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp new file mode 100644 index 0000000000..537a7c5e01 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_gemm_local.hpp" + +#include "depthwise_implementation.hpp" +#include "depthwise_depthfirst_quantized.hpp" +#include "depthwise_depthfirst_generic_quantized.hpp" +#include "depthwise_depthfirst_multiplier_quantized.hpp" +#include "depthwise_depthfirst_generic_multiplier_quantized.hpp" + +#include "depthwise_implementation_constraints.hpp" + +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(SVE2) +#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2) +#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp" +#include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" +#endif // defined(__aarch64__) + +#include + +using arm_gemm::Requantize32; + +namespace arm_conv { +namespace depthwise { + +static const DepthwiseImplementation depthwise_u8q_methods[] = { +#if defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(SVE2) + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, +#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2) + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst", + constraint(has_no_channel_multiplier), + nullptr, + [] (const DepthwiseArgs &args, const 
Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericQuantized(args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", + nullptr, + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + return new DepthwiseDepthfirstGenericWithMultiplierQuantized(args, qp); + }, + }, +#endif // defined(__aarch64__) + { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list +}; + +template <> +const DepthwiseImplementation *depthwise_implementation_list() +{ + return depthwise_u8q_methods; +} + +template UniqueDepthwiseCommon depthwise(const DepthwiseArgs &, const Requantize32 &); +template std::vector get_compatible_kernels(const DepthwiseArgs &, const Requantize32 &); + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp new file mode 100644 index 0000000000..6c5ef23684 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include +#include + +using namespace arm_gemm; + +size_t generic_get_packed_size( + const VLType vec_type, + const unsigned int acc_depth, + const unsigned int kernel_rows, + const unsigned int kernel_cols, + const unsigned int n_input_channels +) +{ + const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length(vec_type); + return arm_gemm::roundup((long unsigned int) n_input_channels, per_iter) * kernel_rows * kernel_cols * sizeof(int8_t); +} + +void generic_pack( + const VLType vec_type, + const unsigned int acc_depth, + const unsigned int kernel_rows, + const unsigned int kernel_cols, + const unsigned int n_channels, + void *_outptr, + const void *_weights, + size_t ld_weight_col, + size_t ld_weight_row +) +{ + int8_t *outptr = reinterpret_cast(_outptr); + const int8_t *weights = reinterpret_cast(_weights); + + // Get the strides + ld_weight_col = (ld_weight_col == 0) ? n_channels * sizeof(int8_t) : ld_weight_col; + ld_weight_row = (ld_weight_row == 0) ? kernel_cols * ld_weight_col : ld_weight_row; + + // Pack into per-iter chunks. 
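+  // Layout: every block of channels gets one slot of per_iter bytes per
+  // kernel point, in kernel row-major order; a partial final block still
+  // occupies full slots.  For example (assuming a 128-bit vector unit and
+  // acc_depth == 2, so per_iter == 32), a 3x3 kernel over 40 channels packs
+  // as one block of 32 channels and one of 8, each spanning nine 32-byte
+  // slots.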
+ const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length(vec_type); + for (unsigned int c = 0; c < n_channels; c += per_iter) + { + auto weight_row = weights + c; + const auto to_copy = std::min(per_iter, n_channels - c); + + for (unsigned int i = 0; i < kernel_rows; i++) + { + auto weight_col = weight_row; + + for (unsigned int j = 0; j < kernel_cols; j++) + { + memcpy(outptr, weight_col, to_copy); + outptr += per_iter; + weight_col += ld_weight_col; + } + + weight_row += ld_weight_row; + } + } +} + + +#define ADD_IMPLEMENTATION(ARCH, TYPENAME, TYPE, VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS) \ +struct interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla \ +{ \ + static size_t get_packed_size(const DepthwiseArgs &args); \ + static void pack_parameters( \ + unsigned int n_channels, void *outptr, \ + const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row \ + ); \ +}; \ +\ +size_t interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::get_packed_size(const DepthwiseArgs &args) \ +{ \ + return generic_get_packed_size(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, args.input_channels); \ +} \ +\ +void interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::pack_parameters(unsigned int n_channels, void *outptr, \ + const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row) \ +{ \ + generic_pack(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, n_channels, outptr, weights, ld_weight_col, ld_weight_row); \ +} + + +namespace arm_conv { +namespace depthwise { + +#if defined(__ARM_FEATURE_SVE) + +ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 3, 3) +ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 5, 5) +ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 3, 3) +ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 5, 5) + +#endif // defined(__ARM_FEATURE_SVE) + +ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 3, 3) +ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 5, 5) +ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 3, 3) +ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 5, 5) + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp new file mode 100644 index 0000000000..3d3447bf3c --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(__aarch64__) + +#include "arm_gemm.hpp" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include + +namespace arm_conv { +namespace depthwise { + +struct interleave_a64_s8q_3x3_dot +{ + static size_t get_packed_size(const DepthwiseArgs &); + static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row); +}; + +size_t interleave_a64_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args) +{ + // We store 7 vectors for every of channels. + const unsigned int n = arm_gemm::roundup( + arm_gemm::iceildiv((long unsigned int) args.input_channels, + get_vector_length(arm_gemm::VLType::None)), 4lu + ); + return n * 7 * get_vector_length(arm_gemm::VLType::None); +} + +void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row) +{ + __asm__ __volatile__( + "movi v0.16b, #0x0\n" + "cmp %x[ld_weight_col], XZR\n" + "movi v31.16b, #0x1\n" + "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n" + "movi v16.4s, #0x9\n" + "mov x19, #0x3\n" + "cmp %x[ld_weight_row], XZR\n" + "mul x19, %x[ld_weight_col], x19\n" + "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n" + "add x24, %x[weights], %x[ld_weight_row]\n" + "add x23, x24, %x[ld_weight_row]\n" + "add x22, %x[ld_weight_col], %x[ld_weight_col]\n" + "lsr x20, %x[n_channels], #0x2\n" + "mov x21, #0x0\n" + "add x19, %x[qp], %[offsetof_input_offset]\n" + "ld1r { v30.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_weights_offset]\n" + "ld1r { v29.4s }, [x19]\n" + "mul v29.4s, v29.4s, v30.4s\n" + "add x19, %x[qp], %[offsetof_per_layer_mul]\n" + "ld1r { v28.4s }, [x19]\n" + "mul v29.4s, v29.4s, v16.4s\n" + "add x19, %x[qp], %[offsetof_per_layer_right_shift]\n" + "ld1r { v27.4s }, [x19]\n" + "cbz x20, 4f\n" + "1:" // Loop + "movi v26.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "ldr q26, [%x[bias], x21]\n" + "2:" // Loop: Skip bias load + "movi v25.4s, #0x0\n" + "ldr s24, [%x[weights], #0x0]\n" + "ldr s23, [%x[weights], %x[ld_weight_col]]\n" + "zip1 v23.16b, v23.16b, v0.16b\n" + "ldr s21, [%x[weights], x22]\n" + "add %x[weights], %x[weights], #0x4\n" + "zip1 v21.16b, v24.16b, v21.16b\n" + "ldr s22, [x24, #0x0]\n" + "ldr s20, [x24, %x[ld_weight_col]]\n" + "zip1 v21.16b, v21.16b, v23.16b\n" + "ldr s18, [x24, x22]\n" + ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n" + "add x24, x24, #0x4\n" + "zip1 v20.16b, v20.16b, v0.16b\n" + "ldr s19, [x23, #0x0]\n" + "ldr s17, [x23, %x[ld_weight_col]]\n" + "zip1 v18.16b, v22.16b, v18.16b\n" + "ldr s16, [x23, x22]\n" + "zip1 v18.16b, v18.16b, v20.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x4e9297f9 // sdot v25.4s, v31.16b, v18.16b\n" + "zip1 v17.16b, v17.16b, v0.16b\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "zip1 v16.16b, v16.16b, v17.16b\n" + ".inst 0x4e9097f9 // sdot v25.4s, v31.16b, v16.16b\n" + "mls v26.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "str q26, [%x[outptr], #0x0]\n" + "str q21, [%x[outptr], #0x10]\n" + "str q18, [%x[outptr], #0x20]\n" + "str q16, [%x[outptr], #0x30]\n" + 
"add %x[outptr], %x[outptr], #0x40\n" + "cbz %x[rq_mul_perchannel], 3f\n" + "ldr q28, [%x[rq_mul_perchannel], x21]\n" + "ldr q27, [%x[rq_shift_perchannel], x21]\n" + "3:" // Loop: Quantisation parameters: Store + "str q28, [%x[outptr], #0x0]\n" + "add x21, x21, #0x10\n" + "str q27, [%x[outptr], #0x10]\n" + "subs x20, x20, #0x1\n" + "add %x[outptr], %x[outptr], #0x20\n" + "bgt 1b\n" + "tst %x[n_channels], #0x3\n" + "beq 13f\n" + "4:" // Oddments + "movi v26.4s, #0x0\n" + "cbz %x[bias], 7f\n" + "add %x[bias], %x[bias], x21\n" + "tbz %x[n_channels], #1, 5f\n" + "ld1 { v26.d }[0], [%x[bias]], #0x8\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v26.s }[2], [%x[bias]], #0x4\n" + "b 6f\n" + "5:" // Oddments: Load bias: Bit 1: Unset + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v26.s }[0], [%x[bias]], #0x4\n" + "6:" // Oddments: Load bias: Bit 1: End + + "7:" // Oddments: Skip bias load + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v24.h }[0], [%x[weights]]\n" + "ld1 { v22.h }[0], [x24]\n" + "add x20, %x[weights], %x[ld_weight_col]\n" + "ld1 { v19.h }[0], [x23]\n" + "add x19, %x[weights], x22\n" + "ld1 { v23.h }[0], [x20]\n" + "add %x[weights], %x[weights], #0x2\n" + "ld1 { v21.h }[0], [x19]\n" + "add x20, x24, %x[ld_weight_col]\n" + "add x19, x24, x22\n" + "ld1 { v20.h }[0], [x20]\n" + "ld1 { v18.h }[0], [x19]\n" + "add x24, x24, #0x2\n" + "add x19, x23, %x[ld_weight_col]\n" + "ld1 { v17.h }[0], [x19]\n" + "add x19, x23, x22\n" + "ld1 { v16.h }[0], [x19]\n" + "add x23, x23, #0x2\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v24.b }[2], [%x[weights]]\n" + "ld1 { v22.b }[2], [x24]\n" + "add x20, %x[weights], %x[ld_weight_col]\n" + "ld1 { v19.b }[2], [x23]\n" + "add x19, %x[weights], x22\n" + "ld1 { v23.b }[2], [x20]\n" + "add %x[weights], %x[weights], #0x1\n" + "ld1 { v21.b }[2], [x19]\n" + "add x20, x24, %x[ld_weight_col]\n" + "add x19, x24, x22\n" + "ld1 { v20.b }[2], [x20]\n" + "ld1 { v18.b }[2], [x19]\n" + "add x20, x23, %x[ld_weight_col]\n" + "add x19, x23, x22\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 9f\n" + "8:" // Oddments: Load weights: Bit 1: Unset + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v24.b }[0], [%x[weights]]\n" + "ld1 { v22.b }[0], [x24]\n" + "add x20, %x[weights], %x[ld_weight_col]\n" + "ld1 { v19.b }[0], [x23]\n" + "add x19, %x[weights], x22\n" + "ld1 { v23.b }[0], [x20]\n" + "add %x[weights], %x[weights], #0x1\n" + "ld1 { v21.b }[0], [x19]\n" + "add x20, x24, %x[ld_weight_col]\n" + "add x19, x24, x22\n" + "ld1 { v20.b }[0], [x20]\n" + "ld1 { v18.b }[0], [x19]\n" + "add x20, x23, %x[ld_weight_col]\n" + "add x19, x23, x22\n" + "ld1 { v17.b }[0], [x20]\n" + "ld1 { v16.b }[0], [x19]\n" + "9:" // Oddments: Load weights: Bit 1: End + "zip1 v21.16b, v24.16b, v21.16b\n" + "zip1 v23.16b, v23.16b, v0.16b\n" + "zip1 v18.16b, v22.16b, v18.16b\n" + "zip1 v20.16b, v20.16b, v0.16b\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "zip1 v17.16b, v17.16b, v0.16b\n" + "zip1 v21.16b, v21.16b, v23.16b\n" + "zip1 v18.16b, v18.16b, v20.16b\n" + "zip1 v16.16b, v16.16b, v17.16b\n" + "movi v25.4s, #0x0\n" + ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n" + ".inst 0x4e9297f9 // sdot v25.4s, v31.16b, v18.16b\n" + ".inst 0x4e9097f9 // sdot v25.4s, v31.16b, v16.16b\n" + "mls v26.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "str q26, [%x[outptr], #0x0]\n" + "str q21, [%x[outptr], #0x10]\n" + "str q18, [%x[outptr], #0x20]\n" + "str q16, [%x[outptr], #0x30]\n" + "add %x[outptr], %x[outptr], #0x40\n" + "cbz %x[rq_mul_perchannel], 12f\n" + "add x20, %x[rq_mul_perchannel], x21\n" + "add 
x19, %x[rq_shift_perchannel], x21\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v28.d }[0], [x20], #0x8\n" + "ld1 { v27.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v28.s }[2], [x20], #0x4\n" + "ld1 { v27.s }[2], [x19], #0x4\n" + "b 11f\n" + "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v28.s }[0], [x20], #0x4\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End + + "12:" // Oddments: Quantisation parameters: Store + "str q28, [%x[outptr], #0x0]\n" + "str q27, [%x[outptr], #0x10]\n" + "add %x[outptr], %x[outptr], #0x20\n" + "13:" // End + + : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights) + : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts) + : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp new file mode 100644 index 0000000000..a725dcab59 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if defined(__aarch64__) + +#include "arm_gemm.hpp" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include + +namespace arm_conv { +namespace depthwise { + +struct interleave_a64_u8q_3x3_dot +{ + static size_t get_packed_size(const DepthwiseArgs &); + static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row); +}; + +size_t interleave_a64_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args) +{ + // We store 7 vectors for every of channels. + const unsigned int n = arm_gemm::roundup( + arm_gemm::iceildiv((long unsigned int) args.input_channels, + get_vector_length(arm_gemm::VLType::None)), 4lu + ); + return n * 7 * get_vector_length(arm_gemm::VLType::None); +} + +void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row) +{ + __asm__ __volatile__( + "movi v0.16b, #0x0\n" + "cmp %x[ld_weight_col], XZR\n" + "movi v31.16b, #0x1\n" + "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n" + "movi v16.4s, #0x9\n" + "mov x19, #0x3\n" + "cmp %x[ld_weight_row], XZR\n" + "mul x19, %x[ld_weight_col], x19\n" + "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n" + "add x24, %x[weights], %x[ld_weight_row]\n" + "add x23, x24, %x[ld_weight_row]\n" + "add x22, %x[ld_weight_col], %x[ld_weight_col]\n" + "lsr x20, %x[n_channels], #0x2\n" + "mov x21, #0x0\n" + "add x19, %x[qp], %[offsetof_input_offset]\n" + "ld1r { v30.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_weights_offset]\n" + "ld1r { v29.4s }, [x19]\n" + "mul v29.4s, v29.4s, v30.4s\n" + "add x19, %x[qp], %[offsetof_per_layer_mul]\n" + "ld1r { v28.4s }, [x19]\n" + "mul v29.4s, v29.4s, v16.4s\n" + "add x19, %x[qp], %[offsetof_per_layer_right_shift]\n" + "ld1r { v27.4s }, [x19]\n" + "cbz x20, 4f\n" + "1:" // Loop + "movi v26.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "ldr q26, [%x[bias], x21]\n" + "2:" // Loop: Skip bias load + "movi v25.4s, #0x0\n" + "ldr s24, [%x[weights], #0x0]\n" + "ldr s23, [%x[weights], %x[ld_weight_col]]\n" + "zip1 v23.16b, v23.16b, v0.16b\n" + "ldr s21, [%x[weights], x22]\n" + "add %x[weights], %x[weights], #0x4\n" + "zip1 v21.16b, v24.16b, v21.16b\n" + "ldr s22, [x24, #0x0]\n" + "ldr s20, [x24, %x[ld_weight_col]]\n" + "zip1 v21.16b, v21.16b, v23.16b\n" + "ldr s18, [x24, x22]\n" + ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n" + "add x24, x24, #0x4\n" + "zip1 v20.16b, v20.16b, v0.16b\n" + "ldr s19, [x23, #0x0]\n" + "ldr s17, [x23, %x[ld_weight_col]]\n" + "zip1 v18.16b, v22.16b, v18.16b\n" + "ldr s16, [x23, x22]\n" + "zip1 v18.16b, v18.16b, v20.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x6e9297f9 // udot v25.4s, v31.16b, v18.16b\n" + "zip1 v17.16b, v17.16b, v0.16b\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "zip1 v16.16b, v16.16b, v17.16b\n" + ".inst 0x6e9097f9 // udot v25.4s, v31.16b, v16.16b\n" + "mls v26.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "str q26, [%x[outptr], #0x0]\n" + "str q21, [%x[outptr], #0x10]\n" + "str q18, [%x[outptr], #0x20]\n" + "str q16, [%x[outptr], #0x30]\n" + "add %x[outptr], %x[outptr], #0x40\n" + "cbz %x[rq_mul_perchannel], 3f\n" + "ldr q28, [%x[rq_mul_perchannel], x21]\n" + "ldr q27, [%x[rq_shift_perchannel], x21]\n" + "3:" // Loop: Quantisation parameters: Store + "str q28, [%x[outptr], #0x0]\n" + "add x21, x21, #0x10\n" + 
"str q27, [%x[outptr], #0x10]\n" + "subs x20, x20, #0x1\n" + "add %x[outptr], %x[outptr], #0x20\n" + "bgt 1b\n" + "tst %x[n_channels], #0x3\n" + "beq 13f\n" + "4:" // Oddments + "movi v26.4s, #0x0\n" + "cbz %x[bias], 7f\n" + "add %x[bias], %x[bias], x21\n" + "tbz %x[n_channels], #1, 5f\n" + "ld1 { v26.d }[0], [%x[bias]], #0x8\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v26.s }[2], [%x[bias]], #0x4\n" + "b 6f\n" + "5:" // Oddments: Load bias: Bit 1: Unset + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v26.s }[0], [%x[bias]], #0x4\n" + "6:" // Oddments: Load bias: Bit 1: End + + "7:" // Oddments: Skip bias load + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v24.h }[0], [%x[weights]]\n" + "ld1 { v22.h }[0], [x24]\n" + "add x20, %x[weights], %x[ld_weight_col]\n" + "ld1 { v19.h }[0], [x23]\n" + "add x19, %x[weights], x22\n" + "ld1 { v23.h }[0], [x20]\n" + "add %x[weights], %x[weights], #0x2\n" + "ld1 { v21.h }[0], [x19]\n" + "add x20, x24, %x[ld_weight_col]\n" + "add x19, x24, x22\n" + "ld1 { v20.h }[0], [x20]\n" + "ld1 { v18.h }[0], [x19]\n" + "add x24, x24, #0x2\n" + "add x19, x23, %x[ld_weight_col]\n" + "ld1 { v17.h }[0], [x19]\n" + "add x19, x23, x22\n" + "ld1 { v16.h }[0], [x19]\n" + "add x23, x23, #0x2\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v24.b }[2], [%x[weights]]\n" + "ld1 { v22.b }[2], [x24]\n" + "add x20, %x[weights], %x[ld_weight_col]\n" + "ld1 { v19.b }[2], [x23]\n" + "add x19, %x[weights], x22\n" + "ld1 { v23.b }[2], [x20]\n" + "add %x[weights], %x[weights], #0x1\n" + "ld1 { v21.b }[2], [x19]\n" + "add x20, x24, %x[ld_weight_col]\n" + "add x19, x24, x22\n" + "ld1 { v20.b }[2], [x20]\n" + "ld1 { v18.b }[2], [x19]\n" + "add x20, x23, %x[ld_weight_col]\n" + "add x19, x23, x22\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 9f\n" + "8:" // Oddments: Load weights: Bit 1: Unset + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v24.b }[0], [%x[weights]]\n" + "ld1 { v22.b }[0], [x24]\n" + "add x20, %x[weights], %x[ld_weight_col]\n" + "ld1 { v19.b }[0], [x23]\n" + "add x19, %x[weights], x22\n" + "ld1 { v23.b }[0], [x20]\n" + "add %x[weights], %x[weights], #0x1\n" + "ld1 { v21.b }[0], [x19]\n" + "add x20, x24, %x[ld_weight_col]\n" + "add x19, x24, x22\n" + "ld1 { v20.b }[0], [x20]\n" + "ld1 { v18.b }[0], [x19]\n" + "add x20, x23, %x[ld_weight_col]\n" + "add x19, x23, x22\n" + "ld1 { v17.b }[0], [x20]\n" + "ld1 { v16.b }[0], [x19]\n" + "9:" // Oddments: Load weights: Bit 1: End + "zip1 v21.16b, v24.16b, v21.16b\n" + "zip1 v23.16b, v23.16b, v0.16b\n" + "zip1 v18.16b, v22.16b, v18.16b\n" + "zip1 v20.16b, v20.16b, v0.16b\n" + "zip1 v16.16b, v19.16b, v16.16b\n" + "zip1 v17.16b, v17.16b, v0.16b\n" + "zip1 v21.16b, v21.16b, v23.16b\n" + "zip1 v18.16b, v18.16b, v20.16b\n" + "zip1 v16.16b, v16.16b, v17.16b\n" + "movi v25.4s, #0x0\n" + ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n" + ".inst 0x6e9297f9 // udot v25.4s, v31.16b, v18.16b\n" + ".inst 0x6e9097f9 // udot v25.4s, v31.16b, v16.16b\n" + "mls v26.4s, v25.4s, v30.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "str q26, [%x[outptr], #0x0]\n" + "str q21, [%x[outptr], #0x10]\n" + "str q18, [%x[outptr], #0x20]\n" + "str q16, [%x[outptr], #0x30]\n" + "add %x[outptr], %x[outptr], #0x40\n" + "cbz %x[rq_mul_perchannel], 12f\n" + "add x20, %x[rq_mul_perchannel], x21\n" + "add x19, %x[rq_shift_perchannel], x21\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v28.d }[0], [x20], #0x8\n" + "ld1 { v27.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v28.s }[2], [x20], #0x4\n" + "ld1 { v27.s }[2], [x19], #0x4\n" + "b 11f\n" + "10:" // 
Oddments: Quantisation parameters: Load quant params: Bit 1: Unset + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v28.s }[0], [x20], #0x4\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End + + "12:" // Oddments: Quantisation parameters: Store + "str q28, [%x[outptr], #0x0]\n" + "str q27, [%x[outptr], #0x10]\n" + "add %x[outptr], %x[outptr], #0x20\n" + "13:" // End + + : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights) + : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts) + : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp new file mode 100644 index 0000000000..41f0495acf --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +namespace arm_conv { +namespace depthwise { + +#if defined(__ARM_FEATURE_SVE) + +class interleave_sve_u8q_3x3_dot +{ + public: + static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_sve_s8q_3x3_dot +{ + public: + static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_sve_u8q_3x3_mla +{ + public: + static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_sve_s8q_3x3_mla +{ + public: + static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_sve_u8q_5x5_mla +{ + public: + static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_sve_s8q_5x5_mla +{ + public: + static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +#endif // defined(__ARM_FEATURE_SVE) + +class interleave_a64_u8q_3x3_dot +{ + public: + static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_a64_s8q_3x3_dot +{ + public: + static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_a64_u8q_3x3_mla +{ + public: + static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_a64_s8q_3x3_mla +{ + public: + static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_a64_u8q_5x5_mla +{ + public: + static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +class interleave_a64_s8q_5x5_mla +{ + public: + static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp new file mode 100644 index 0000000000..ea0c35b7ce --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_s8q_3x3_dot
+{
+  static size_t get_packed_size(const DepthwiseArgs &);
+  static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store 7 vectors for every <vector length> of channels.
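+  // (Each such group of channels gets one vector of bias/offset
+  // corrections, three vectors of interleaved weights and two vectors of
+  // per-channel requantisation multipliers and shifts; see pack_parameters
+  // below.)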
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::SVE);
+}
+
+void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "mov z30.b, #0x0\n"
+ "ptrue p2.b\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "mov z28.b, #0x1\n"
+ "cmp %x[ld_weight_col], XZR\n"
+ "mov z16.s, #0x9\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mul z27.s, p2/M, z27.s, z29.s\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "mov x19, #0x3\n"
+ "mul z27.s, p2/M, z27.s, z16.s\n"
+ "ld1rw { z25.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "mul x19, %x[ld_weight_col], x19\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+ "add x22, %x[weights], %x[ld_weight_row]\n"
+ "add x21, x22, %x[ld_weight_row]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x20, #0x0\n"
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+
+ "2:" // Loop
+ "mov z24.s, #0x0\n"
+ "cntp x19, p2, p1.s\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x20, LSL #2]\n"
+ "whilelt p0.b, XZR, x19\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 z18.b, z16.b, z30.b\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
+ "add %x[weights], %x[weights], x19\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x22]\n"
+ "ld1b { z17.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+ "zip1 z21.b, z16.b, z18.b\n"
+ "ld1b { z16.b }, p0/Z, [x22, x23]\n"
+ "sdot z24.s, z28.b, z21.b\n"
+ "add x22, x22, x19\n"
+ "zip1 z18.b, z17.b, z30.b\n"
+ "ld1b { z20.b }, p0/Z, [x21]\n"
+ "ld1b { z19.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+ "zip1 z17.b, z22.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+ "zip1 z18.b, z17.b, z18.b\n"
+ "add x21, x21, x19\n"
+ "zip1 z17.b, z19.b, z30.b\n"
+ "sdot z24.s, z28.b, z18.b\n"
+ "zip1 z16.b, z20.b, z16.b\n"
+ "zip1 z16.b, z16.b, z17.b\n"
+ "sdot z24.s, z28.b, z16.b\n"
+ "mls z23.s, p2/M, z24.s, z29.s\n"
+ "add z23.s, z23.s, z27.s\n"
+ "st1w { z23.s }, p2, [%x[outptr]]\n"
+ "st1b { z21.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z18.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z26.s }, p1/Z, [%x[rq_mul_perchannel], x20, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [%x[rq_shift_perchannel], x20, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "st1w { z26.s }, p2, [%x[outptr]]\n"
+ "incw x20\n"
+ "st1w { z25.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "whilelt p1.s, x20, %x[n_channels]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" 
(offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x19", "x20", "x21", "x22", "x23", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
new file mode 100644
index 0000000000..edd32a43f5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_u8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every <vector_length> of channels.
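+ // Sizing matches the s8q interleave above; this variant differs only in
+ // taking uint8_t weights and accumulating with udot in pack_parameters.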
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::SVE);
+}
+
+void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "mov z30.b, #0x0\n"
+ "ptrue p2.b\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "mov z28.b, #0x1\n"
+ "cmp %x[ld_weight_col], XZR\n"
+ "mov z16.s, #0x9\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mul z27.s, p2/M, z27.s, z29.s\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "mov x19, #0x3\n"
+ "mul z27.s, p2/M, z27.s, z16.s\n"
+ "ld1rw { z25.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "mul x19, %x[ld_weight_col], x19\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+ "add x22, %x[weights], %x[ld_weight_row]\n"
+ "add x21, x22, %x[ld_weight_row]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x20, #0x0\n"
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+
+ "2:" // Loop
+ "mov z24.s, #0x0\n"
+ "cntp x19, p2, p1.s\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x20, LSL #2]\n"
+ "whilelt p0.b, XZR, x19\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 z18.b, z16.b, z30.b\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
+ "add %x[weights], %x[weights], x19\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x22]\n"
+ "ld1b { z17.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+ "zip1 z21.b, z16.b, z18.b\n"
+ "ld1b { z16.b }, p0/Z, [x22, x23]\n"
+ "udot z24.s, z28.b, z21.b\n"
+ "add x22, x22, x19\n"
+ "zip1 z18.b, z17.b, z30.b\n"
+ "ld1b { z20.b }, p0/Z, [x21]\n"
+ "ld1b { z19.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+ "zip1 z17.b, z22.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+ "zip1 z18.b, z17.b, z18.b\n"
+ "add x21, x21, x19\n"
+ "zip1 z17.b, z19.b, z30.b\n"
+ "udot z24.s, z28.b, z18.b\n"
+ "zip1 z16.b, z20.b, z16.b\n"
+ "zip1 z16.b, z16.b, z17.b\n"
+ "udot z24.s, z28.b, z16.b\n"
+ "mls z23.s, p2/M, z24.s, z29.s\n"
+ "add z23.s, z23.s, z27.s\n"
+ "st1w { z23.s }, p2, [%x[outptr]]\n"
+ "st1b { z21.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z18.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z26.s }, p1/Z, [%x[rq_mul_perchannel], x20, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [%x[rq_shift_perchannel], x20, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "st1w { z26.s }, p2, [%x[outptr]]\n"
+ "incw x20\n"
+ "st1w { z25.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "whilelt p1.s, x20, %x[n_channels]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" 
(offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts) + : "cc", "memory", "p0", "p1", "p2", "p8", "x19", "x20", "x21", "x22", "x23", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..bb43d57018 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..99f46015aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x15, #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x17, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col
+ "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+ "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x12, x12, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v18.8h }, [x24]\n"
+ "add x9, x12, x23, LSL #1\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "add x28, x9, x23, LSL #1\n"
+ "lsl x13, x13, #0x1\n"
+ "add x27, x28, x23, LSL #1\n"
+ "add x26, x13, x13\n"
+ "add x25, x26, x13\n"
+ "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x15\n" // offset *= output_tile_size
+ "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x10, x20, LSL #1\n"
+ "lsl x11, x11, #0x1\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, 
XZR, x21\n" + "lsr x19, %x[n_channels], #0x3\n" + "cbz x19, 4f\n" + "ldr q16, [x14, #0x0]\n" + "ldr q0, [x14, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x14, #0x20]\n" + "ldr q2, [x14, #0x30]\n" + "ldr q3, [x14, #0x40]\n" + "ldr q4, [x14, #0x50]\n" + "ldr q5, [x14, #0x60]\n" + "ldr q6, [x14, #0x70]\n" + "ldr q7, [x14, #0x80]\n" + "ldr q8, [x14, #0x90]\n" + "add x14, x14, #0xa0\n" + "ldr q9, [x9, x13]\n" + "ld1 { v10.8h }, [x12]\n" + "ldr q11, [x12, x25]\n" + "ldr q12, [x9, x26]\n" + "ldr q13, [x28, x13]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n" + "add x22, x22, #0x10\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n" + "add x21, x21, #0x10\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "ld1 { v9.8h }, [x27]\n" + "cmp x21, x19, LSL #4\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr q10, [x28, x26]\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "ldr q11, [x27, x25]\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "ldr q16, [x14, #0x0]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v31.8h, v5.8h, v12.8h\n" + "fmla v30.8h, v4.8h, v12.8h\n" + "ldr q12, [x12, x13]\n" + "fmla v29.8h, v6.8h, v9.8h\n" + "ldr q9, [x12, x26]\n" + "add x12, x12, #0x10\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "fmla v31.8h, v7.8h, v13.8h\n" + "fmla v30.8h, v6.8h, v13.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "ld1 { v11.8h }, [x9]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "ldr q12, [x9, x25]\n" + "add x9, x9, #0x10\n" + "fmla v29.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x14, #0x50]\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "ld1 { v9.8h }, [x28]\n" + "ldr q1, [x14, #0x20]\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q0, [x14, #0x10]\n" + "fmla v28.8h, v2.8h, v12.8h\n" + "ldr q2, [x14, #0x30]\n" + "fmla v31.8h, v8.8h, v10.8h\n" + "fmla v30.8h, v7.8h, v10.8h\n" + "ldr q10, [x28, x25]\n" + "add x28, x28, #0x10\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "ldr q13, [x28, x13]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x27, x13]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr q12, [x27, x26]\n" + "add x27, x27, #0x10\n" + "fmla v28.8h, v5.8h, v10.8h\n" + "ldr q3, [x14, #0x40]\n" + "ldr q5, [x14, #0x60]\n" + "fmla v31.8h, v6.8h, v9.8h\n" + "ldr q9, [x9, x13]\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "ld1 { v10.8h }, [x12]\n" + "fmla v29.8h, v7.8h, v11.8h\n" + "fmla v28.8h, v6.8h, v11.8h\n" + "ldr q11, [x12, x25]\n" + "ldr q6, [x14, #0x70]\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmla v29.8h, v8.8h, v12.8h\n" + "ldr q8, [x14, #0x90]\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "ldr q12, [x9, x26]\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "ldr q7, [x14, #0x80]\n" + "add x14, x14, #0xa0\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "st1 { v31.8h }, [x10]\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q30, [x10, x11]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "st1 { v29.8h }, [x24]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "add x10, x10, #0x10\n" + "str q28, [x24, x11]\n" + "add x24, x24, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "ld1 { v9.8h }, [x27]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr q10, [x28, x26]\n" + "fmla 
v30.8h, v2.8h, v11.8h\n" + "ldr q11, [x27, x25]\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v31.8h, v5.8h, v12.8h\n" + "fmla v30.8h, v4.8h, v12.8h\n" + "ldr q12, [x12, x13]\n" + "fmla v29.8h, v6.8h, v9.8h\n" + "ldr q9, [x12, x26]\n" + "add x12, x12, #0x10\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "fmla v31.8h, v7.8h, v13.8h\n" + "fmla v30.8h, v6.8h, v13.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "ld1 { v11.8h }, [x9]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "ldr q12, [x9, x25]\n" + "add x9, x9, #0x10\n" + "fmla v29.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "ld1 { v9.8h }, [x28]\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "fmla v28.8h, v2.8h, v12.8h\n" + "fmla v31.8h, v8.8h, v10.8h\n" + "fmla v30.8h, v7.8h, v10.8h\n" + "ldr q10, [x28, x25]\n" + "add x28, x28, #0x10\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x27, x13]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr q12, [x27, x26]\n" + "add x27, x27, #0x10\n" + "fmla v28.8h, v5.8h, v10.8h\n" + "fmla v31.8h, v6.8h, v9.8h\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "fmla v29.8h, v7.8h, v11.8h\n" + "fmla v28.8h, v6.8h, v11.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmla v29.8h, v8.8h, v12.8h\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "st1 { v31.8h }, [x10]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "str q30, [x10, x11]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "add x10, x10, #0x10\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "st1 { v29.8h }, [x24]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "str q28, [x24, x11]\n" + "add x24, x24, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x1\n" + "beq 31f\n" + "ldr q16, [x14, #0x0]\n" + "ldr q0, [x14, #0x10]\n" + "add x23, x9, x13\n" + "ldr q1, [x14, #0x20]\n" + "add x22, x12, XZR\n" + "ldr q2, [x14, #0x30]\n" + "add x21, x12, x25\n" + "ldr q3, [x14, #0x40]\n" + "add x20, x9, x26\n" + "ldr q4, [x14, #0x50]\n" + "add x19, x28, x13\n" + "ldr q5, [x14, #0x60]\n" + "ldr q6, [x14, #0x70]\n" + "ldr q7, [x14, #0x80]\n" + "ldr q8, [x14, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr s9, [x23], #0x4\n" + "ldr s10, [x22], #0x4\n" + "ldr s11, [x21], #0x4\n" + "ldr s12, [x20], #0x4\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.h }[2], [x23]\n" + "ld1 { v10.h }[2], [x22]\n" + "ld1 { v11.h }[2], [x21]\n" + "ld1 { v12.h }[2], [x20]\n" + "ld1 { v13.h }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset + "ldr h9, [x23, #0x0]\n" + "ldr h10, [x22, #0x0]\n" + "ldr h11, [x21, #0x0]\n" + "ldr h12, [x20, #0x0]\n" + "ldr h13, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n" + "add x19, x27, XZR\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v31.8h, v5.8h, v12.8h\n" + "fmla v30.8h, v4.8h, v12.8h\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (3, 0): 
Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v29.8h, v6.8h, v9.8h\n" + "add x19, x27, x25\n" + "fmla v31.8h, v7.8h, v13.8h\n" + "fmla v30.8h, v6.8h, v13.8h\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v28.8h, v8.8h, v11.8h\n" + "add x19, x12, x13\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End + "fmla v31.8h, v1.8h, v12.8h\n" + "add x19, x12, x26\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End + "fmla v31.8h, v2.8h, v9.8h\n" + "add x19, x28, x26\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End + "fmla v31.8h, v8.8h, v10.8h\n" + "add x19, x9, XZR\n" + "fmla v30.8h, v7.8h, v10.8h\n" + "fmla v29.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End + "fmla v31.8h, v3.8h, v11.8h\n" + "add x19, x9, x25\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.8h, v5.8h, v12.8h\n" + "add x19, x28, XZR\n" + "fmla v28.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.8h, v6.8h, v9.8h\n" + "add x19, x28, x25\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v30.8h, v8.8h, v10.8h\n" + "add x19, x27, x13\n" + "fmla v28.8h, v5.8h, v10.8h\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load 
inputs: (3, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "add x19, x27, x26\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "mov x19, x10\n"
+ "st1 { v31.s }[0], [x19], x11\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v30.s }[0], [x19]\n"
+ "mov x19, x24\n"
+ "st1 { v29.s }[0], [x19], x11\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "mov x20, x10\n"
+ "st1 { v31.h }[2], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v29.h }[2], [x19], x11\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x10\n"
+ "st1 { v31.h }[0], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v29.h }[0], [x19], x11\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "30:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "31:" // Tile loop: End
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x17, #0x1\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x16, x16, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x16, x19\n"
+ "csel x16, x16, XZR, LT\n"
+ "csel x17, x17, x21, LT\n"
+ "cmp x17, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..af83238d2e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x3\n"
+ "cbz x27, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add 
x15, x15, #0xa0\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldr x22, [x16, #0x20]\n" + "ldr q9, [x26, x14]\n" + "ldr q10, [x25, x14]\n" + "ldr q11, [x24, x14]\n" + "ldr q12, [x23, x14]\n" + "ldr q13, [x22, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n" + "ldr x21, [x16, #0x28]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n" + "ldr x20, [x16, #0x30]\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n" + "ldr x19, [x16, #0x38]\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "ldr q9, [x21, x14]\n" + "ldr x26, [x16, #0x40]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr x25, [x16, #0x48]\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "ldr q11, [x20, x14]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q10, [x25, x14]\n" + "ldr x24, [x16, #0x50]\n" + "fmla v31.8h, v5.8h, v12.8h\n" + "ldr x23, [x16, #0x58]\n" + "fmla v30.8h, v4.8h, v12.8h\n" + "fmla v29.8h, v6.8h, v9.8h\n" + "ldr q12, [x19, x14]\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr q9, [x26, x14]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.8h, v7.8h, v13.8h\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.8h, v6.8h, v13.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "ldr x20, [x16, #0x70]\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "ldp x26, x25, [x16, #0x0]\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "fmla v29.8h, v5.8h, v10.8h\n" + "ldr q12, [x23, x14]\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldr q16, [x15, #0x0]\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "ldr q4, [x15, #0x50]\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q9, [x22, x14]\n" + "fmla v28.8h, v2.8h, v12.8h\n" + "ldr x22, [x16, #0x20]\n" + "ldr q0, [x15, #0x10]\n" + "fmla v31.8h, v8.8h, v10.8h\n" + "ldr q1, [x15, #0x20]\n" + "fmla v30.8h, v7.8h, v10.8h\n" + "ldr q10, [x21, x14]\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "ldr q13, [x22, x11]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x20, x14]\n" + "ldr q2, [x15, #0x30]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "fmla v28.8h, v5.8h, v10.8h\n" + "ldr q12, [x19, x14]\n" + "add x14, x14, #0x10\n" + "fmla v31.8h, v6.8h, v9.8h\n" + "ldr q9, [x26, x11]\n" + "fmla v29.8h, v7.8h, v11.8h\n" + "ldr q3, [x15, #0x40]\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "ldr q10, [x25, x11]\n" + "fmla v28.8h, v6.8h, v11.8h\n" + "ldr q11, [x24, x11]\n" + "ldr q5, [x15, #0x60]\n" + "fmla v29.8h, v8.8h, v12.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "ldr q6, [x15, #0x70]\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "ldr q8, [x15, #0x90]\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "ldr q12, [x23, x11]\n" + "add x11, x11, #0x10\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "ldr q7, [x15, #0x80]\n" + "cmp x11, x27, LSL #4\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "str q31, [x13, x28]\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "add x15, x15, #0xa0\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q30, [x12, x28]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "str q29, [x10, x28]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "str q28, [x9, x28]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n" + "ldr x21, [x16, #0x28]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n" + "ldr x20, [x16, #0x30]\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n" + "ldr x19, [x16, #0x38]\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "ldr q9, [x21, x14]\n" + "ldr x26, [x16, 
#0x40]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr x25, [x16, #0x48]\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "ldr q11, [x20, x14]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q10, [x25, x14]\n" + "ldr x24, [x16, #0x50]\n" + "fmla v31.8h, v5.8h, v12.8h\n" + "ldr x23, [x16, #0x58]\n" + "fmla v30.8h, v4.8h, v12.8h\n" + "fmla v29.8h, v6.8h, v9.8h\n" + "ldr q12, [x19, x14]\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr q9, [x26, x14]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.8h, v7.8h, v13.8h\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.8h, v6.8h, v13.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "ldr x20, [x16, #0x70]\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "ldr q12, [x23, x14]\n" + "fmla v29.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "ldr q9, [x22, x14]\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "fmla v28.8h, v2.8h, v12.8h\n" + "fmla v31.8h, v8.8h, v10.8h\n" + "fmla v30.8h, v7.8h, v10.8h\n" + "ldr q10, [x21, x14]\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x20, x14]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr q12, [x19, x14]\n" + "add x14, x14, #0x10\n" + "fmla v28.8h, v5.8h, v10.8h\n" + "fmla v31.8h, v6.8h, v9.8h\n" + "fmla v29.8h, v7.8h, v11.8h\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "fmla v28.8h, v6.8h, v11.8h\n" + "fmla v29.8h, v8.8h, v12.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "str q31, [x13, x28]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "str q30, [x12, x28]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q29, [x10, x28]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "str q28, [x9, x28]\n" + "3:" // Oddments + "tst %x[n_channels], #0x1\n" + "beq 30f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x28, x14\n" + "ldr q1, [x15, #0x20]\n" + "add x13, x13, x28\n" + "ldr q2, [x15, #0x30]\n" + "add x12, x12, x28\n" + "ldr q3, [x15, #0x40]\n" + "add x10, x10, x28\n" + "ldr q4, [x15, #0x50]\n" + "add x9, x9, x28\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x26, [x16, #0x0]\n" + "ldr x25, [x16, #0x8]\n" + "add x26, x26, x14\n" + "ldr x24, [x16, #0x10]\n" + "ldr x23, [x16, #0x18]\n" + "add x25, x25, x14\n" + "ldr x22, [x16, #0x20]\n" + "add x24, x24, x14\n" + "add x23, x23, x14\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.s }[0], [x26], #0x4\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "ld1 { v12.s }[0], [x23], #0x4\n" + "ld1 { v13.s }[0], [x22], #0x4\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.h }[2], [x26], #0x2\n" + "ld1 { v10.h }[2], [x25], #0x2\n" + "ld1 { v11.h }[2], [x24], #0x2\n" + "ld1 { v12.h }[2], [x23], #0x2\n" + "ld1 { v13.h }[2], [x22], #0x2\n" + "b 5f\n" + "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset + "ld1 { v9.h }[0], [x26], #0x2\n" + "ld1 { v10.h }[0], [x25], #0x2\n" + "ld1 { v11.h }[0], [x24], #0x2\n" + "ld1 { v12.h }[0], [x23], #0x2\n" + "ld1 { v13.h }[0], [x22], #0x2\n" + "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n" + "ldr x21, [x16, #0x28]\n" + "add x21, x21, x14\n" + "mov v30.16b, v16.16b\n fmla v30.8h, 
v3.8h, v9.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v31.8h, v5.8h, v12.8h\n" + "fmla v30.8h, v4.8h, v12.8h\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v9.s }[0], [x21], #0x4\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v9.h }[2], [x21], #0x2\n" + "b 7f\n" + "6:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v9.h }[0], [x21], #0x2\n" + "7:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v29.8h, v6.8h, v9.8h\n" + "ldr x20, [x16, #0x30]\n" + "fmla v31.8h, v7.8h, v13.8h\n" + "add x20, x20, x14\n" + "fmla v30.8h, v6.8h, v13.8h\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v11.s }[0], [x20], #0x4\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v11.h }[2], [x20], #0x2\n" + "b 9f\n" + "8:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v11.h }[0], [x20], #0x2\n" + "9:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v28.8h, v8.8h, v11.8h\n" + "ldr x19, [x16, #0x38]\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v12.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v12.h }[2], [x19], #0x2\n" + "b 11f\n" + "10:" // Oddments: Load input (0, 1): Bit 1: Unset + "ld1 { v12.h }[0], [x19], #0x2\n" + "11:" // Oddments: Load input (0, 1): Bit 1: End + "fmla v31.8h, v1.8h, v12.8h\n" + "ldr x26, [x16, #0x40]\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v9.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v9.h }[2], [x26], #0x2\n" + "b 13f\n" + "12:" // Oddments: Load input (0, 2): Bit 1: Unset + "ld1 { v9.h }[0], [x26], #0x2\n" + "13:" // Oddments: Load input (0, 2): Bit 1: End + "fmla v31.8h, v2.8h, v9.8h\n" + "ldr x25, [x16, #0x48]\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v10.h }[2], [x25], #0x2\n" + "b 15f\n" + "14:" // Oddments: Load input (2, 2): Bit 1: Unset + "ld1 { v10.h }[0], [x25], #0x2\n" + "15:" // Oddments: Load input (2, 2): Bit 1: End + "fmla v31.8h, v8.8h, v10.8h\n" + "ldr x24, [x16, #0x50]\n" + "fmla v30.8h, v7.8h, v10.8h\n" + "add x24, x24, x14\n" + "fmla v29.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v11.h }[2], [x24], #0x2\n" + "b 17f\n" + "16:" // Oddments: Load input (1, 0): Bit 1: Unset + "ld1 { v11.h }[0], [x24], #0x2\n" + "17:" // Oddments: Load input (1, 0): Bit 1: End + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr x23, [x16, #0x58]\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v12.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v12.h }[2], [x23], #0x2\n" + "b 19f\n" + "18:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v12.h }[0], [x23], #0x2\n" + "19:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr x22, [x16, #0x60]\n" + "fmla v28.8h, v2.8h, v12.8h\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v9.s }[0], [x22], #0x4\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v9.h }[2], [x22], #0x2\n" + "b 21f\n" + "20:" // Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v9.h }[0], [x22], #0x2\n" + "21:" 
// Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "st1 { v31.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x9], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x9], #0x2\n"
+ "29:" // Oddments: Store: Bit 1: End
+
+ "30:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..90db8703b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3bdd544a54
--- /dev/null
+++ 
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x23, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x7, 
x22\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col + "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x26\n" // offset *= kernel_stride * output_size + "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x15, x15, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1r { v18.8h }, [x24]\n" + "add x12, x15, x22, LSL #1\n" + "ld1r { v17.8h }, [x21]\n" + "add x11, x12, x22, LSL #1\n" + "lsl x16, x16, #0x1\n" + "add x10, x11, x22, LSL #1\n" + "add x9, x10, x22, LSL #1\n" + "add x28, x16, x16\n" + "add x27, x28, x16\n" + "add x26, x27, x16\n" + "mul x19, x7, x20\n" // offset = tile_i * ld_output_row + "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x25\n" // offset *= output_tile_size + "add x13, x13, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "add x25, x13, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "lsl x14, x14, #0x1\n" + "add x22, x14, x14\n" + "mov x21, #0x10\n" // cntb _, ALL, #1 + "sub x20, XZR, x21\n" + "lsr x19, %x[n_channels], #0x3\n" + "cbz x19, 4f\n" + "ldr q16, [x17, #0x0]\n" + "ldr q0, [x17, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x17, #0x20]\n" + "ldr q2, [x17, #0x30]\n" + "ldr q3, [x17, #0x40]\n" + "ldr q4, [x17, #0x50]\n" + "ldr q5, [x17, #0x60]\n" + "ldr q6, [x17, #0x70]\n" + "ldr q7, [x17, #0x80]\n" + "ldr q8, [x17, #0x90]\n" + "add x17, x17, #0xa0\n" + "ldr q9, [x11, x28]\n" + "ld1 { v10.8h }, [x15]\n" + "ldr q11, [x15, x26]\n" + "ld1 { v12.8h }, [x9]\n" + "ldr q13, [x12, x28]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "add x23, x23, #0x10\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "add x21, x21, #0x10\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n" + "cmp x21, x19, LSL #4\n" + "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n" + "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n" + "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n" + "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n" + "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n" + "ldr q16, [x17, #0x0]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr q10, [x11, x27]\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "ldr q11, [x11, x16]\n" + "fmla v25.8h, v6.8h, v12.8h\n" + "ldr q12, [x9, x26]\n" + "fmla v30.8h, v4.8h, v13.8h\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v13.8h\n" + "fmla v28.8h, v2.8h, v13.8h\n" + "fmla v27.8h, v1.8h, v13.8h\n" + "fmla v26.8h, v0.8h, v13.8h\n" + "ldr q13, [x15, x16]\n" + "fmla v23.8h, v8.8h, v12.8h\n" + "ldr q12, [x15, x27]\n" + "fmla v31.8h, v7.8h, v11.8h\n" + "fmla v30.8h, v6.8h, v11.8h\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v27.8h, v3.8h, v11.8h\n" + "fmla v25.8h, v1.8h, v11.8h\n" + "fmla v24.8h, v0.8h, v11.8h\n" + "ld1 { v11.8h }, [x12]\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "ldr q13, [x12, x26]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v5.8h, v10.8h\n" + "fmla v26.8h, v4.8h, v10.8h\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "ld1 { v12.8h }, [x10]\n" + "fmla v29.8h, v7.8h, v10.8h\n" + "fmla v24.8h, v2.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "ldr q10, [x10, x28]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q11, [x10, x26]\n" + "fmla v29.8h, v5.8h, v13.8h\n" + "fmla 
v26.8h, v2.8h, v13.8h\n" + "ldr q13, [x9, x16]\n" + "fmla v25.8h, v3.8h, v12.8h\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "ldr q12, [x12, x16]\n" + "fmla v27.8h, v7.8h, v10.8h\n" + "fmla v26.8h, v6.8h, v10.8h\n" + "fmla v25.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v8.8h, v10.8h\n" + "fmla v24.8h, v4.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v26.8h, v8.8h, v11.8h\n" + "fmla v25.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v6.8h, v13.8h\n" + "ldr q13, [x9, x27]\n" + "fmla v23.8h, v5.8h, v11.8h\n" + "ldr q11, [x12, x27]\n" + "add x12, x12, #0x10\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v0.8h, v12.8h\n" + "ldr q12, [x10, x16]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "fmla v30.8h, v5.8h, v11.8h\n" + "fmla v26.8h, v1.8h, v11.8h\n" + "fmla v27.8h, v2.8h, v11.8h\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v24.8h, v8.8h, v13.8h\n" + "ld1 { v10.8h }, [x15]\n" + "fmla v23.8h, v7.8h, v13.8h\n" + "ldr q13, [x10, x27]\n" + "add x10, x10, #0x10\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v27.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "fmla v24.8h, v3.8h, v12.8h\n" + "ld1 { v12.8h }, [x11]\n" + "fmla v31.8h, v2.8h, v11.8h\n" + "fmla v30.8h, v1.8h, v11.8h\n" + "ldr q1, [x17, #0x20]\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x11, x26]\n" + "add x11, x11, #0x10\n" + "fmla v27.8h, v8.8h, v13.8h\n" + "ldr q9, [x11, x28]\n" + "fmla v26.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v5.8h, v13.8h\n" + "fmla v23.8h, v4.8h, v13.8h\n" + "ldr q13, [x9, x28]\n" + "add x9, x9, #0x10\n" + "fmla v31.8h, v6.8h, v12.8h\n" + "ldr q4, [x17, #0x50]\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr q3, [x17, #0x40]\n" + "fmla v25.8h, v0.8h, v12.8h\n" + "ld1 { v12.8h }, [x9]\n" + "fmla v29.8h, v8.8h, v11.8h\n" + "ldr q0, [x17, #0x10]\n" + "fmla v26.8h, v5.8h, v11.8h\n" + "ldr q5, [x17, #0x60]\n" + "fmla v23.8h, v2.8h, v11.8h\n" + "ldr q11, [x15, x26]\n" + "fmla v25.8h, v8.8h, v13.8h\n" + "ldr q2, [x17, #0x30]\n" + "fmla v24.8h, v7.8h, v13.8h\n" + "ldr q7, [x17, #0x80]\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "ldr q8, [x17, #0x90]\n" + "fmla v23.8h, v6.8h, v13.8h\n" + "ldr q13, [x12, x28]\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "ldr q6, [x17, #0x70]\n" + "add x17, x17, #0xa0\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "st1 { v31.8h }, [x13]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "str q30, [x13, x14]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q29, [x13, x22]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "add x13, x13, #0x10\n" + "fmax v27.8h, v27.8h, v18.8h\n" + "st1 { v28.8h }, [x25]\n" + "fmax v26.8h, v26.8h, v18.8h\n" + "fmax v25.8h, v25.8h, v18.8h\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "str q27, [x25, x14]\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "str q26, [x25, x22]\n" + "fmax v24.8h, v24.8h, v18.8h\n" + "add x25, x25, #0x10\n" + "fmax v23.8h, v23.8h, v18.8h\n" + "st1 { v25.8h }, [x24]\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "str q24, [x24, x14]\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "str q23, [x24, x22]\n" + "add x24, x24, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n" + "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n" + "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n" + "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, 
v9.8h\n" + "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n" + "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr q10, [x11, x27]\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "ldr q11, [x11, x16]\n" + "fmla v25.8h, v6.8h, v12.8h\n" + "ldr q12, [x9, x26]\n" + "fmla v30.8h, v4.8h, v13.8h\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v13.8h\n" + "fmla v28.8h, v2.8h, v13.8h\n" + "fmla v27.8h, v1.8h, v13.8h\n" + "fmla v26.8h, v0.8h, v13.8h\n" + "ldr q13, [x15, x16]\n" + "fmla v23.8h, v8.8h, v12.8h\n" + "ldr q12, [x15, x27]\n" + "fmla v31.8h, v7.8h, v11.8h\n" + "fmla v30.8h, v6.8h, v11.8h\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v27.8h, v3.8h, v11.8h\n" + "fmla v25.8h, v1.8h, v11.8h\n" + "fmla v24.8h, v0.8h, v11.8h\n" + "ld1 { v11.8h }, [x12]\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "ldr q13, [x12, x26]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v5.8h, v10.8h\n" + "fmla v26.8h, v4.8h, v10.8h\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "ld1 { v12.8h }, [x10]\n" + "fmla v29.8h, v7.8h, v10.8h\n" + "fmla v24.8h, v2.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "ldr q10, [x10, x28]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q11, [x10, x26]\n" + "fmla v29.8h, v5.8h, v13.8h\n" + "fmla v26.8h, v2.8h, v13.8h\n" + "ldr q13, [x9, x16]\n" + "fmla v25.8h, v3.8h, v12.8h\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "ldr q12, [x12, x16]\n" + "fmla v27.8h, v7.8h, v10.8h\n" + "fmla v26.8h, v6.8h, v10.8h\n" + "fmla v25.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v8.8h, v10.8h\n" + "fmla v24.8h, v4.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v26.8h, v8.8h, v11.8h\n" + "fmla v25.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v6.8h, v13.8h\n" + "ldr q13, [x9, x27]\n" + "fmla v23.8h, v5.8h, v11.8h\n" + "ldr q11, [x12, x27]\n" + "add x12, x12, #0x10\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v0.8h, v12.8h\n" + "ldr q12, [x10, x16]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "fmla v30.8h, v5.8h, v11.8h\n" + "fmla v26.8h, v1.8h, v11.8h\n" + "fmla v27.8h, v2.8h, v11.8h\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v24.8h, v8.8h, v13.8h\n" + "fmla v23.8h, v7.8h, v13.8h\n" + "ldr q13, [x10, x27]\n" + "add x10, x10, #0x10\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v27.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "fmla v24.8h, v3.8h, v12.8h\n" + "ld1 { v12.8h }, [x11]\n" + "fmla v31.8h, v2.8h, v11.8h\n" + "fmla v30.8h, v1.8h, v11.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x11, x26]\n" + "add x11, x11, #0x10\n" + "fmla v27.8h, v8.8h, v13.8h\n" + "fmla v26.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v5.8h, v13.8h\n" + "fmla v23.8h, v4.8h, v13.8h\n" + "ldr q13, [x9, x28]\n" + "add x9, x9, #0x10\n" + "fmla v31.8h, v6.8h, v12.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v25.8h, v0.8h, v12.8h\n" + "fmla v29.8h, v8.8h, v11.8h\n" + "fmla v26.8h, v5.8h, v11.8h\n" + "fmla v23.8h, v2.8h, v11.8h\n" + "fmla v25.8h, v8.8h, v13.8h\n" + "fmla v24.8h, v7.8h, v13.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmla v23.8h, v6.8h, v13.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "st1 { v31.8h }, [x13]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "str q30, [x13, x14]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q29, [x13, x22]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "add x13, x13, #0x10\n" + "fmax v27.8h, v27.8h, 
v18.8h\n" + "st1 { v28.8h }, [x25]\n" + "fmax v26.8h, v26.8h, v18.8h\n" + "fmax v25.8h, v25.8h, v18.8h\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "str q27, [x25, x14]\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "str q26, [x25, x22]\n" + "fmax v24.8h, v24.8h, v18.8h\n" + "add x25, x25, #0x10\n" + "fmax v23.8h, v23.8h, v18.8h\n" + "st1 { v25.8h }, [x24]\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "str q24, [x24, x14]\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "str q23, [x24, x22]\n" + "add x24, x24, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x1\n" + "beq 49f\n" + "ldr q16, [x17, #0x0]\n" + "ldr q0, [x17, #0x10]\n" + "add x23, x11, x28\n" + "ldr q1, [x17, #0x20]\n" + "add x22, x15, XZR\n" + "ldr q2, [x17, #0x30]\n" + "add x21, x15, x26\n" + "ldr q3, [x17, #0x40]\n" + "add x20, x9, XZR\n" + "ldr q4, [x17, #0x50]\n" + "add x19, x12, x28\n" + "ldr q5, [x17, #0x60]\n" + "ldr q6, [x17, #0x70]\n" + "ldr q7, [x17, #0x80]\n" + "ldr q8, [x17, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr s9, [x23], #0x4\n" + "ldr s10, [x22], #0x4\n" + "ldr s11, [x21], #0x4\n" + "ldr s12, [x20], #0x4\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.h }[2], [x23]\n" + "ld1 { v10.h }[2], [x22]\n" + "ld1 { v11.h }[2], [x21]\n" + "ld1 { v12.h }[2], [x20]\n" + "ld1 { v13.h }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset + "ldr h9, [x23, #0x0]\n" + "ldr h10, [x22, #0x0]\n" + "ldr h11, [x21, #0x0]\n" + "ldr h12, [x20, #0x0]\n" + "ldr h13, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "add x19, x9, x26\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n" + "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n" + "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n" + "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n" + "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n" + "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "fmla v25.8h, v6.8h, v12.8h\n" + "fmla v30.8h, v4.8h, v13.8h\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v13.8h\n" + "fmla v28.8h, v2.8h, v13.8h\n" + "fmla v27.8h, v1.8h, v13.8h\n" + "fmla v26.8h, v0.8h, v13.8h\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v23.8h, v8.8h, v12.8h\n" + "add x19, x11, x16\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v31.8h, v7.8h, v11.8h\n" + "add x19, x15, x16\n" + "fmla v30.8h, v6.8h, v11.8h\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v27.8h, v3.8h, v11.8h\n" + "fmla v25.8h, v1.8h, v11.8h\n" + "fmla v24.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: 
Unset + "ldr h13, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End + "fmla v31.8h, v1.8h, v13.8h\n" + "add x19, x15, x27\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End + "fmla v30.8h, v2.8h, v12.8h\n" + "add x19, x11, x27\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v30.8h, v8.8h, v10.8h\n" + "add x19, x12, XZR\n" + "fmla v29.8h, v7.8h, v10.8h\n" + "fmla v27.8h, v5.8h, v10.8h\n" + "fmla v26.8h, v4.8h, v10.8h\n" + "fmla v24.8h, v2.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End + "fmla v31.8h, v3.8h, v11.8h\n" + "add x19, x12, x26\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End + "fmla v29.8h, v5.8h, v13.8h\n" + "add x19, x10, XZR\n" + "fmla v26.8h, v2.8h, v13.8h\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v28.8h, v6.8h, v12.8h\n" + "add x19, x10, x28\n" + "fmla v25.8h, v3.8h, v12.8h\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v28.8h, v8.8h, v10.8h\n" + "add x19, x10, x26\n" + "fmla v27.8h, v7.8h, v10.8h\n" + "fmla v26.8h, v6.8h, v10.8h\n" + "fmla v25.8h, v5.8h, v10.8h\n" + "fmla v24.8h, v4.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v26.8h, v8.8h, v11.8h\n" + "add x19, x9, x16\n" + "fmla v23.8h, v5.8h, v11.8h\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End + "fmla v25.8h, v7.8h, v13.8h\n" + "add x19, x12, x16\n" + "fmla v24.8h, v6.8h, v13.8h\n" + "tbz %x[n_channels], #1, 
29f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End + "fmla v31.8h, v4.8h, v12.8h\n" + "add x19, x12, x27\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.8h, v5.8h, v11.8h\n" + "add x19, x9, x27\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "fmla v27.8h, v2.8h, v11.8h\n" + "fmla v26.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v24.8h, v8.8h, v13.8h\n" + "add x19, x10, x16\n" + "fmla v23.8h, v7.8h, v13.8h\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 36f\n" + "35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v28.8h, v7.8h, v12.8h\n" + "add x19, x15, x28\n" + "fmla v27.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "fmla v24.8h, v3.8h, v12.8h\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End + "fmla v31.8h, v2.8h, v11.8h\n" + "add x19, x10, x27\n" + "fmla v30.8h, v1.8h, v11.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v27.8h, v8.8h, v13.8h\n" + "add x19, x11, XZR\n" + "fmla v26.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v5.8h, v13.8h\n" + "fmla v23.8h, v4.8h, v13.8h\n" + "tbz %x[n_channels], #1, 41f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 42f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 42f\n" + "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.8h, v6.8h, v12.8h\n" + "add x19, x11, x26\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v25.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 43f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 44f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 44f\n" + "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v29.8h, v8.8h, v11.8h\n" + "add x19, x9, x28\n" + "fmla v26.8h, v5.8h, v11.8h\n" + "fmla v23.8h, v2.8h, v11.8h\n" + "tbz %x[n_channels], #1, 45f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 46f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 46f\n" + "45:" // 
Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+    "ldr h13, [x19, #0x0]\n"
+    "46:"  // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+    "fmla v25.8h, v8.8h, v13.8h\n"
+    "fmla v24.8h, v7.8h, v13.8h\n"
+    "fmla v23.8h, v6.8h, v13.8h\n"
+    "fmax v31.8h, v31.8h, v18.8h\n"
+    "fmax v30.8h, v30.8h, v18.8h\n"
+    "fmax v29.8h, v29.8h, v18.8h\n"
+    "fmin v31.8h, v31.8h, v17.8h\n"
+    "fmin v30.8h, v30.8h, v17.8h\n"
+    "fmin v29.8h, v29.8h, v17.8h\n"
+    "fmax v28.8h, v28.8h, v18.8h\n"
+    "fmax v27.8h, v27.8h, v18.8h\n"
+    "fmax v26.8h, v26.8h, v18.8h\n"
+    "fmin v28.8h, v28.8h, v17.8h\n"
+    "fmin v27.8h, v27.8h, v17.8h\n"
+    "fmin v26.8h, v26.8h, v17.8h\n"
+    "fmax v25.8h, v25.8h, v18.8h\n"
+    "fmax v24.8h, v24.8h, v18.8h\n"
+    "fmax v23.8h, v23.8h, v18.8h\n"
+    "fmin v25.8h, v25.8h, v17.8h\n"
+    "fmin v24.8h, v24.8h, v17.8h\n"
+    "fmin v23.8h, v23.8h, v17.8h\n"
+    "tbz %x[n_channels], #1, 47f\n"
+    "mov x19, x13\n"
+    "st1 { v31.s }[0], [x19], x14\n"
+    "add x13, x13, #0x4\n"
+    "st1 { v30.s }[0], [x19], x14\n"
+    "mov x20, x25\n"
+    "st1 { v29.s }[0], [x19]\n"
+    "st1 { v28.s }[0], [x20], x14\n"
+    "add x25, x25, #0x4\n"
+    "st1 { v27.s }[0], [x20], x14\n"
+    "mov x19, x24\n"
+    "st1 { v26.s }[0], [x20]\n"
+    "add x24, x24, #0x4\n"
+    "st1 { v25.s }[0], [x19], x14\n"
+    "st1 { v24.s }[0], [x19], x14\n"
+    "st1 { v23.s }[0], [x19]\n"
+    "tbz %x[n_channels], #0, 48f\n"
+    "mov x21, x13\n"
+    "st1 { v31.h }[2], [x21], x14\n"
+    "mov x20, x25\n"
+    "st1 { v30.h }[2], [x21], x14\n"
+    "st1 { v28.h }[2], [x20], x14\n"
+    "mov x19, x24\n"
+    "st1 { v29.h }[2], [x21]\n"
+    "st1 { v27.h }[2], [x20], x14\n"
+    "st1 { v26.h }[2], [x20]\n"
+    "st1 { v25.h }[2], [x19], x14\n"
+    "st1 { v24.h }[2], [x19], x14\n"
+    "st1 { v23.h }[2], [x19]\n"
+    "b 48f\n"
+    "47:"  // Tile loop: Oddments: Store: Bit 1: Unset
+    "mov x21, x13\n"
+    "st1 { v31.h }[0], [x21], x14\n"
+    "mov x20, x25\n"
+    "mov x19, x24\n"
+    "st1 { v30.h }[0], [x21], x14\n"
+    "st1 { v28.h }[0], [x20], x14\n"
+    "st1 { v29.h }[0], [x21]\n"
+    "st1 { v27.h }[0], [x20], x14\n"
+    "st1 { v26.h }[0], [x20]\n"
+    "st1 { v25.h }[0], [x19], x14\n"
+    "st1 { v24.h }[0], [x19], x14\n"
+    "st1 { v23.h }[0], [x19]\n"
+    "48:"  // Tile loop: Oddments: Store: Bit 1: End
+
+    "49:"  // Tile loop: End
+    "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "add x21, x7, #0x1\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x8, x8, #0x1\n"
+    "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "cmp x8, x19\n"
+    "csel x8, x8, XZR, LT\n"
+    "csel x7, x7, x21, LT\n"
+    "cmp x7, x20\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
"v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..ed47c308c4 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v18.8h }, [x20]\n"
+    "ld1r { v17.8h }, [x19]\n"
+    "mov x14, #0x0\n"
+    "mov x13, #0x10\n"  // cntb _, ALL, #1
+    "sub x12, XZR, x13\n"
+    "lsr x11, %x[n_channels], #0x3\n"
+    "cbz x11, 3f\n"
+    "ldr q16, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x13, x11, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "ldr x26, [x16, #0x20]\n"
+    "ldr q9, [x10, x14]\n"
+    "ldr q10, [x9, x14]\n"
+    "ldr q11, [x28, x14]\n"
+    "ldr q12, [x27, x14]\n"
+    "ldr q13, [x26, x14]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+    "ldr x10, [x16, #0x40]\n"
+    "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+    "ldr x9, [x16, #0x48]\n"
+    "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+    "ldr x28, [x16, #0x50]\n"
+    "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+    "ldr x27, [x16, #0x58]\n"
+    "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+    "ldr x26, [x16, #0x60]\n"
+    "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+    "ldr x22, [x17, #0x0]\n"
+    "fmla v31.8h, v0.8h, v10.8h\n"
+    "ldr q10, [x9, x14]\n"
+    "fmla v29.8h, v2.8h, v11.8h\n"
+
"ldr q11, [x24, x14]\n" + "fmla v25.8h, v6.8h, v12.8h\n" + "ldr q12, [x25, x14]\n" + "fmla v30.8h, v4.8h, v13.8h\n" + "ldr x25, [x16, #0x68]\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "ldr x24, [x16, #0x70]\n" + "fmla v29.8h, v3.8h, v13.8h\n" + "ldr x9, [x16, #0x88]\n" + "fmla v28.8h, v2.8h, v13.8h\n" + "ldr x21, [x17, #0x8]\n" + "fmla v27.8h, v1.8h, v13.8h\n" + "ldr x20, [x17, #0x10]\n" + "fmla v26.8h, v0.8h, v13.8h\n" + "ldr q13, [x23, x14]\n" + "fmla v23.8h, v8.8h, v12.8h\n" + "ldr q12, [x10, x14]\n" + "fmla v31.8h, v7.8h, v11.8h\n" + "ldr x23, [x16, #0x78]\n" + "fmla v30.8h, v6.8h, v11.8h\n" + "ldr x10, [x16, #0x80]\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "ldr x19, [x17, #0x18]\n" + "fmla v27.8h, v3.8h, v11.8h\n" + "ldr q16, [x15, #0x0]\n" + "fmla v25.8h, v1.8h, v11.8h\n" + "fmla v24.8h, v0.8h, v11.8h\n" + "ldr q11, [x28, x14]\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "ldr q13, [x27, x14]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "ldr x27, [x16, #0x98]\n" + "fmla v27.8h, v5.8h, v10.8h\n" + "fmla v26.8h, v4.8h, v10.8h\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "ldr q12, [x26, x14]\n" + "fmla v29.8h, v7.8h, v10.8h\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v24.8h, v2.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "ldr q10, [x25, x14]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "fmla v29.8h, v5.8h, v13.8h\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.8h, v2.8h, v13.8h\n" + "ldr q13, [x23, x14]\n" + "fmla v25.8h, v3.8h, v12.8h\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "ldr q12, [x10, x14]\n" + "fmla v27.8h, v7.8h, v10.8h\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v26.8h, v6.8h, v10.8h\n" + "fmla v25.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v8.8h, v10.8h\n" + "fmla v24.8h, v4.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v26.8h, v8.8h, v11.8h\n" + "fmla v25.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v6.8h, v13.8h\n" + "ldr q13, [x28, x14]\n" + "fmla v23.8h, v5.8h, v11.8h\n" + "ldr q11, [x9, x14]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v0.8h, v12.8h\n" + "ldr q12, [x27, x14]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "fmla v30.8h, v5.8h, v11.8h\n" + "fmla v26.8h, v1.8h, v11.8h\n" + "fmla v27.8h, v2.8h, v11.8h\n" + "ldr q11, [x26, x14]\n" + "fmla v24.8h, v8.8h, v13.8h\n" + "ldr x26, [x16, #0x20]\n" + "fmla v23.8h, v7.8h, v13.8h\n" + "ldr q13, [x25, x14]\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v27.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "fmla v24.8h, v3.8h, v12.8h\n" + "ldr q12, [x24, x14]\n" + "fmla v31.8h, v2.8h, v11.8h\n" + "fmla v30.8h, v1.8h, v11.8h\n" + "ldr q1, [x15, #0x20]\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x23, x14]\n" + "fmla v27.8h, v8.8h, v13.8h\n" + "fmla v26.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v5.8h, v13.8h\n" + "fmla v23.8h, v4.8h, v13.8h\n" + "ldr q13, [x10, x14]\n" + "add x14, x14, #0x10\n" + "fmla v31.8h, v6.8h, v12.8h\n" + "ldp x10, x9, [x16, #0x0]\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldp x28, x27, [x16, #0x10]\n" + "fmla v25.8h, v0.8h, v12.8h\n" + "ldr q0, [x15, #0x10]\n" + "fmla v29.8h, v8.8h, v11.8h\n" + "ldr q9, [x10, x13]\n" + "fmla v26.8h, v5.8h, v11.8h\n" + "ldr q10, [x9, x13]\n" + "fmla v23.8h, v2.8h, v11.8h\n" + "ldr q11, [x28, x13]\n" + "fmla v25.8h, v8.8h, v13.8h\n" + "ldr q12, [x27, x13]\n" + "fmla v24.8h, v7.8h, v13.8h\n" + "ldr q2, [x15, #0x30]\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "ldr 
q3, [x15, #0x40]\n" + "fmla v23.8h, v6.8h, v13.8h\n" + "ldr q13, [x26, x13]\n" + "add x13, x13, #0x10\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "ldr q4, [x15, #0x50]\n" + "cmp x13, x11, LSL #4\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "ldr q5, [x15, #0x60]\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "ldr q6, [x15, #0x70]\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q31, [x22, x12]\n" + "fmax v27.8h, v27.8h, v18.8h\n" + "ldr x22, [x17, #0x20]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "ldr q7, [x15, #0x80]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "str q30, [x21, x12]\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "str q29, [x20, x12]\n" + "fmax v26.8h, v26.8h, v18.8h\n" + "ldr x21, [x17, #0x28]\n" + "fmax v25.8h, v25.8h, v18.8h\n" + "str q28, [x19, x12]\n" + "fmax v24.8h, v24.8h, v18.8h\n" + "str q27, [x22, x12]\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "ldr x20, [x17, #0x30]\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "ldr x19, [x17, #0x38]\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "str q26, [x21, x12]\n" + "fmax v23.8h, v23.8h, v18.8h\n" + "str q25, [x20, x12]\n" + "ldr x22, [x17, #0x40]\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "str q24, [x19, x12]\n" + "str q23, [x22, x12]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x25, [x16, #0x28]\n" + "add x12, x12, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "ldr x24, [x16, #0x30]\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "ldr x23, [x16, #0x38]\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n" + "ldr x10, [x16, #0x40]\n" + "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n" + "ldr x9, [x16, #0x48]\n" + "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n" + "ldr x28, [x16, #0x50]\n" + "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n" + "ldr x27, [x16, #0x58]\n" + "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n" + "ldr x26, [x16, #0x60]\n" + "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n" + "ldr x22, [x17, #0x0]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr q10, [x9, x14]\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "fmla v25.8h, v6.8h, v12.8h\n" + "ldr q12, [x25, x14]\n" + "fmla v30.8h, v4.8h, v13.8h\n" + "ldr x25, [x16, #0x68]\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "ldr x24, [x16, #0x70]\n" + "fmla v29.8h, v3.8h, v13.8h\n" + "ldr x9, [x16, #0x88]\n" + "fmla v28.8h, v2.8h, v13.8h\n" + "ldr x21, [x17, #0x8]\n" + "fmla v27.8h, v1.8h, v13.8h\n" + "ldr x20, [x17, #0x10]\n" + "fmla v26.8h, v0.8h, v13.8h\n" + "ldr q13, [x23, x14]\n" + "fmla v23.8h, v8.8h, v12.8h\n" + "ldr q12, [x10, x14]\n" + "fmla v31.8h, v7.8h, v11.8h\n" + "ldr x23, [x16, #0x78]\n" + "fmla v30.8h, v6.8h, v11.8h\n" + "ldr x10, [x16, #0x80]\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "ldr x19, [x17, #0x18]\n" + "fmla v27.8h, v3.8h, v11.8h\n" + "fmla v25.8h, v1.8h, v11.8h\n" + "fmla v24.8h, v0.8h, v11.8h\n" + "ldr q11, [x28, x14]\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "ldr q13, [x27, x14]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "ldr x27, [x16, #0x98]\n" + "fmla v27.8h, v5.8h, v10.8h\n" + "fmla v26.8h, v4.8h, v10.8h\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "ldr q12, [x26, x14]\n" + "fmla v29.8h, v7.8h, v10.8h\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v24.8h, v2.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v30.8h, v8.8h, v10.8h\n" + "ldr q10, [x25, x14]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q11, 
[x24, x14]\n" + "fmla v29.8h, v5.8h, v13.8h\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.8h, v2.8h, v13.8h\n" + "ldr q13, [x23, x14]\n" + "fmla v25.8h, v3.8h, v12.8h\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "ldr q12, [x10, x14]\n" + "fmla v27.8h, v7.8h, v10.8h\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v26.8h, v6.8h, v10.8h\n" + "fmla v25.8h, v5.8h, v10.8h\n" + "fmla v28.8h, v8.8h, v10.8h\n" + "fmla v24.8h, v4.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v26.8h, v8.8h, v11.8h\n" + "fmla v25.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v6.8h, v13.8h\n" + "ldr q13, [x28, x14]\n" + "fmla v23.8h, v5.8h, v11.8h\n" + "ldr q11, [x9, x14]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v0.8h, v12.8h\n" + "ldr q12, [x27, x14]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "fmla v30.8h, v5.8h, v11.8h\n" + "fmla v26.8h, v1.8h, v11.8h\n" + "fmla v27.8h, v2.8h, v11.8h\n" + "ldr q11, [x26, x14]\n" + "fmla v24.8h, v8.8h, v13.8h\n" + "fmla v23.8h, v7.8h, v13.8h\n" + "ldr q13, [x25, x14]\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v27.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "fmla v24.8h, v3.8h, v12.8h\n" + "ldr q12, [x24, x14]\n" + "fmla v31.8h, v2.8h, v11.8h\n" + "fmla v30.8h, v1.8h, v11.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x23, x14]\n" + "fmla v27.8h, v8.8h, v13.8h\n" + "fmla v26.8h, v7.8h, v13.8h\n" + "fmla v24.8h, v5.8h, v13.8h\n" + "fmla v23.8h, v4.8h, v13.8h\n" + "ldr q13, [x10, x14]\n" + "add x14, x14, #0x10\n" + "fmla v31.8h, v6.8h, v12.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v25.8h, v0.8h, v12.8h\n" + "fmla v29.8h, v8.8h, v11.8h\n" + "fmla v26.8h, v5.8h, v11.8h\n" + "fmla v23.8h, v2.8h, v11.8h\n" + "fmla v25.8h, v8.8h, v13.8h\n" + "fmla v24.8h, v7.8h, v13.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmla v23.8h, v6.8h, v13.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "str q31, [x22, x12]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "ldr x22, [x17, #0x20]\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q30, [x21, x12]\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v27.8h, v27.8h, v18.8h\n" + "ldr x21, [x17, #0x28]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "str q29, [x20, x12]\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "fmax v26.8h, v26.8h, v18.8h\n" + "str q28, [x19, x12]\n" + "fmax v25.8h, v25.8h, v18.8h\n" + "ldr x20, [x17, #0x30]\n" + "fmax v24.8h, v24.8h, v18.8h\n" + "str q27, [x22, x12]\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "ldr x19, [x17, #0x38]\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "ldr x22, [x17, #0x40]\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "str q26, [x21, x12]\n" + "fmax v23.8h, v23.8h, v18.8h\n" + "str q25, [x20, x12]\n" + "str q24, [x19, x12]\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "str q23, [x22, x12]\n" + "3:" // Oddments + "tst %x[n_channels], #0x1\n" + "beq 48f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x12, x14\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x10, [x16, #0x0]\n" + "add x10, x10, x14\n" + "ldr x9, [x16, #0x8]\n" + "ldr x28, [x16, #0x10]\n" + "add x9, x9, x14\n" + "ldr x27, [x16, #0x18]\n" + "ldr x26, [x16, #0x20]\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.s }[0], [x10], #0x4\n" + "ld1 { v10.s }[0], [x9], #0x4\n" + "ld1 { v11.s }[0], 
[x28], #0x4\n" + "ld1 { v12.s }[0], [x27], #0x4\n" + "ld1 { v13.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.h }[2], [x10], #0x2\n" + "ld1 { v10.h }[2], [x9], #0x2\n" + "ld1 { v11.h }[2], [x28], #0x2\n" + "ld1 { v12.h }[2], [x27], #0x2\n" + "ld1 { v13.h }[2], [x26], #0x2\n" + "b 5f\n" + "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset + "ld1 { v9.h }[0], [x10], #0x2\n" + "ld1 { v10.h }[0], [x9], #0x2\n" + "ld1 { v11.h }[0], [x28], #0x2\n" + "ld1 { v12.h }[0], [x27], #0x2\n" + "ld1 { v13.h }[0], [x26], #0x2\n" + "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x25, [x16, #0x28]\n" + "add x25, x25, x14\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n" + "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n" + "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n" + "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n" + "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n" + "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "fmla v25.8h, v6.8h, v12.8h\n" + "fmla v30.8h, v4.8h, v13.8h\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v13.8h\n" + "fmla v28.8h, v2.8h, v13.8h\n" + "fmla v27.8h, v1.8h, v13.8h\n" + "fmla v26.8h, v0.8h, v13.8h\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v12.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v12.h }[2], [x25], #0x2\n" + "b 7f\n" + "6:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v12.h }[0], [x25], #0x2\n" + "7:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v23.8h, v8.8h, v12.8h\n" + "ldr x24, [x16, #0x30]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v11.h }[2], [x24], #0x2\n" + "b 9f\n" + "8:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v11.h }[0], [x24], #0x2\n" + "9:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v31.8h, v7.8h, v11.8h\n" + "ldr x23, [x16, #0x38]\n" + "fmla v30.8h, v6.8h, v11.8h\n" + "add x23, x23, x14\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v27.8h, v3.8h, v11.8h\n" + "fmla v25.8h, v1.8h, v11.8h\n" + "fmla v24.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v13.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v13.h }[2], [x23], #0x2\n" + "b 11f\n" + "10:" // Oddments: Load input (0, 1): Bit 1: Unset + "ld1 { v13.h }[0], [x23], #0x2\n" + "11:" // Oddments: Load input (0, 1): Bit 1: End + "fmla v31.8h, v1.8h, v13.8h\n" + "ldr x10, [x16, #0x40]\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "add x10, x10, x14\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v12.s }[0], [x10], #0x4\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v12.h }[2], [x10], #0x2\n" + "b 13f\n" + "12:" // Oddments: Load input (0, 3): Bit 1: Unset + "ld1 { v12.h }[0], [x10], #0x2\n" + "13:" // Oddments: Load input (0, 3): Bit 1: End + "fmla v30.8h, v2.8h, v12.8h\n" + "ldr x9, [x16, #0x48]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "add x9, x9, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v10.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v10.h }[2], [x9], #0x2\n" + "b 15f\n" + "14:" // Oddments: Load input (2, 3): Bit 1: Unset + "ld1 { v10.h }[0], [x9], #0x2\n" + "15:" // Oddments: Load input (2, 3): Bit 1: End + "fmla v30.8h, v8.8h, v10.8h\n" + "ldr 
x28, [x16, #0x50]\n" + "fmla v29.8h, v7.8h, v10.8h\n" + "add x28, x28, x14\n" + "fmla v27.8h, v5.8h, v10.8h\n" + "fmla v26.8h, v4.8h, v10.8h\n" + "fmla v24.8h, v2.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v11.s }[0], [x28], #0x4\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v11.h }[2], [x28], #0x2\n" + "b 17f\n" + "16:" // Oddments: Load input (1, 0): Bit 1: Unset + "ld1 { v11.h }[0], [x28], #0x2\n" + "17:" // Oddments: Load input (1, 0): Bit 1: End + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr x27, [x16, #0x58]\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v13.s }[0], [x27], #0x4\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v13.h }[2], [x27], #0x2\n" + "b 19f\n" + "18:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v13.h }[0], [x27], #0x2\n" + "19:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v29.8h, v5.8h, v13.8h\n" + "ldr x26, [x16, #0x60]\n" + "fmla v26.8h, v2.8h, v13.8h\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v12.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v12.h }[2], [x26], #0x2\n" + "b 21f\n" + "20:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v12.h }[0], [x26], #0x2\n" + "21:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v28.8h, v6.8h, v12.8h\n" + "ldr x25, [x16, #0x68]\n" + "fmla v25.8h, v3.8h, v12.8h\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v10.h }[2], [x25], #0x2\n" + "b 23f\n" + "22:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v10.h }[0], [x25], #0x2\n" + "23:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v28.8h, v8.8h, v10.8h\n" + "ldr x24, [x16, #0x70]\n" + "fmla v27.8h, v7.8h, v10.8h\n" + "add x24, x24, x14\n" + "fmla v26.8h, v6.8h, v10.8h\n" + "fmla v25.8h, v5.8h, v10.8h\n" + "fmla v24.8h, v4.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v11.h }[2], [x24], #0x2\n" + "b 25f\n" + "24:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x24], #0x2\n" + "25:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v26.8h, v8.8h, v11.8h\n" + "ldr x23, [x16, #0x78]\n" + "fmla v23.8h, v5.8h, v11.8h\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v13.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v13.h }[2], [x23], #0x2\n" + "b 27f\n" + "26:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v13.h }[0], [x23], #0x2\n" + "27:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v25.8h, v7.8h, v13.8h\n" + "ldr x10, [x16, #0x80]\n" + "fmla v24.8h, v6.8h, v13.8h\n" + "add x10, x10, x14\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v12.s }[0], [x10], #0x4\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v12.h }[2], [x10], #0x2\n" + "b 29f\n" + "28:" // Oddments: Load input (1, 1): Bit 1: Unset + "ld1 { v12.h }[0], [x10], #0x2\n" + "29:" // Oddments: Load input (1, 1): Bit 1: End + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr x9, [x16, #0x88]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "add x9, x9, x14\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v27.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v11.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v11.h }[2], [x9], #0x2\n" + "b 31f\n" + "30:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v11.h }[0], [x9], #0x2\n" + "31:" // Oddments: Load input (1, 3): 
Bit 1: End + "fmla v30.8h, v5.8h, v11.8h\n" + "ldr x28, [x16, #0x90]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "add x28, x28, x14\n" + "fmla v27.8h, v2.8h, v11.8h\n" + "fmla v26.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v13.s }[0], [x28], #0x4\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v13.h }[2], [x28], #0x2\n" + "b 33f\n" + "32:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v13.h }[0], [x28], #0x2\n" + "33:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v24.8h, v8.8h, v13.8h\n" + "ldr x27, [x16, #0x98]\n" + "fmla v23.8h, v7.8h, v13.8h\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v12.s }[0], [x27], #0x4\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v12.h }[2], [x27], #0x2\n" + "b 35f\n" + "34:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v12.h }[0], [x27], #0x2\n" + "35:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v28.8h, v7.8h, v12.8h\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v27.8h, v6.8h, v12.8h\n" + "add x26, x26, x14\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "fmla v24.8h, v3.8h, v12.8h\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v11.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v11.h }[2], [x26], #0x2\n" + "b 37f\n" + "36:" // Oddments: Load input (0, 2): Bit 1: Unset + "ld1 { v11.h }[0], [x26], #0x2\n" + "37:" // Oddments: Load input (0, 2): Bit 1: End + "fmla v31.8h, v2.8h, v11.8h\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v30.8h, v1.8h, v11.8h\n" + "add x25, x25, x14\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v13.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v13.h }[2], [x25], #0x2\n" + "b 39f\n" + "38:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v13.h }[0], [x25], #0x2\n" + "39:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v27.8h, v8.8h, v13.8h\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.8h, v7.8h, v13.8h\n" + "add x24, x24, x14\n" + "fmla v24.8h, v5.8h, v13.8h\n" + "fmla v23.8h, v4.8h, v13.8h\n" + "tbz %x[n_channels], #1, 40f\n" + "ld1 { v12.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 41f\n" + "ld1 { v12.h }[2], [x24], #0x2\n" + "b 41f\n" + "40:" // Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v12.h }[0], [x24], #0x2\n" + "41:" // Oddments: Load input (2, 0): Bit 1: End + "fmla v31.8h, v6.8h, v12.8h\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "add x23, x23, x14\n" + "fmla v25.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 42f\n" + "ld1 { v11.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 43f\n" + "ld1 { v11.h }[2], [x23], #0x2\n" + "b 43f\n" + "42:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x23], #0x2\n" + "43:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v29.8h, v8.8h, v11.8h\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v26.8h, v5.8h, v11.8h\n" + "add x10, x10, x14\n" + "fmla v23.8h, v2.8h, v11.8h\n" + "tbz %x[n_channels], #1, 44f\n" + "ld1 { v13.s }[0], [x10], #0x4\n" + "tbz %x[n_channels], #0, 45f\n" + "ld1 { v13.h }[2], [x10], #0x2\n" + "b 45f\n" + "44:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v13.h }[0], [x10], #0x2\n" + "45:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v25.8h, v8.8h, v13.8h\n" + "fmla v24.8h, v7.8h, v13.8h\n" + "fmla v23.8h, v6.8h, v13.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "fmax v27.8h, v27.8h, v18.8h\n" + 
"fmax v26.8h, v26.8h, v18.8h\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "fmax v25.8h, v25.8h, v18.8h\n" + "fmax v24.8h, v24.8h, v18.8h\n" + "fmax v23.8h, v23.8h, v18.8h\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "tbz %x[n_channels], #1, 46f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.s }[0], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.s }[0], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.s }[0], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.s }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.s }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.s }[0], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.s }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.s }[0], [x19]\n" + "add x12, x12, #0x4\n" + "st1 { v23.s }[0], [x22]\n" + "tbz %x[n_channels], #0, 47f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.h }[2], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.h }[2], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.h }[2], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.h }[2], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.h }[2], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.h }[2], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.h }[2], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.h }[2], [x19]\n" + "st1 { v23.h }[2], [x22]\n" + "b 47f\n" + "46:" // Oddments: Store: Bit 1: Unset + "ldr x22, [x17, #0x0]\n" + "add x22, x22, x12\n" + "ldr x21, [x17, #0x8]\n" + "ldr x20, [x17, #0x10]\n" + "add x21, x21, x12\n" + "st1 { v31.h }[0], [x22]\n" + "ldr x19, [x17, #0x18]\n" + "add x20, x20, x12\n" + "st1 { v30.h }[0], [x21]\n" + "add x19, x19, x12\n" + "st1 { v29.h }[0], [x20]\n" + "ldr x22, [x17, #0x20]\n" + "add x22, x22, x12\n" + "st1 { v28.h }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.h }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.h }[0], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.h }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.h }[0], [x19]\n" + "st1 { v23.h }[0], [x22]\n" + "47:" // Oddments: Store: Bit 1: End + + "48:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && 
defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp new file mode 100644 index 0000000000..df5328724d --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); +void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + +struct a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 4; + constexpr static unsigned int output_cols = 4; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl; + + a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // 
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..bf18469199 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,1233 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const __fp16 *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + __fp16 *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const __fp16 min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "mov x4, #0x0\n" + "mov x26, #0x0\n" + "1:" // Tile loop + "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x25, #0x4\n" + "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x24, #0x4\n" + "ldr x5,
[%x[params_struct], %[offsetof_args_params]]\n" + "add x23, %x[params_struct], %[offsetof_args_min]\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "add x21, %x[params_struct], %[offsetof_args_max]\n" + "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "mov x7, #0x0\n" + "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x4, x22\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col + "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x25\n" // offset *= kernel_stride * output_size + "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1r { v15.8h }, [x23]\n" + "add x15, x8, x22, LSL #1\n" + "ld1r { v14.8h }, [x21]\n" + "add x14, x15, x22, LSL #1\n" + "lsl x6, x6, #0x1\n" + "add x13, x14, x22, LSL #1\n" + "add x12, x13, x22, LSL #1\n" + "add x11, x12, x22, LSL #1\n" + "add x10, x6, x6\n" + "add x9, x10, x6\n" + "add x28, x9, x6\n" + "add x27, x28, x6\n" + "mul x19, x4, x20\n" // offset = tile_i * ld_output_row + "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x24\n" // offset *= output_tile_size + "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "add x26, x16, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "lsl x17, x17, #0x1\n" + "add x23, x17, x17\n" + "add x22, x23, x17\n" + "mov x21, #0x10\n" // cntb _, ALL, #1 + "sub x20, XZR, x21\n" + "lsr x19, %x[n_channels], #0x3\n" + "cbz x19, 4f\n" + "ldr q13, [x5, #0x0]\n" + "ldr q0, [x5, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x5, #0x20]\n" + "ldr q2, [x5, #0x30]\n" + "ldr q3, [x5, #0x40]\n" + "ldr q4, [x5, #0x50]\n" + "ldr q5, [x5, #0x60]\n" + "ldr q6, [x5, #0x70]\n" + "ldr q7, [x5, #0x80]\n" + "ldr q8, [x5, #0x90]\n" + "add x5, x5, #0xa0\n" + "ldr q9, [x14, x10]\n" + "ld1 { v10.8h }, [x8]\n" + "ldr q11, [x8, x27]\n" + "ldr q12, [x14, x9]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "add x7, x7, #0x10\n" + "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "add x21, x21, #0x10\n" + "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n" + "cmp x21, x19, LSL #4\n" + "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n" + "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n" + "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n" + "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n" + "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n" + "ldr q9, [x13, x10]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ld1 { v10.8h }, [x11]\n" + "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n" + "ldr q11, [x11, x27]\n" + "fmla v30.8h, v8.8h, v12.8h\n" + "fmla v29.8h, v7.8h, v12.8h\n" + "fmla v26.8h, v5.8h, v12.8h\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n" + "fmla v22.8h, v2.8h, v12.8h\n" + "fmla v21.8h, v1.8h, v12.8h\n" + "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n" + "ldr q12, [x8, x6]\n" + "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n" + "ldr q10, [x13, x9]\n" + "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n" + "ldr q11, [x8, x28]\n" + "fmla v27.8h, v8.8h, v9.8h\n" + "fmla v26.8h, v7.8h, v9.8h\n" + "fmla v25.8h, v6.8h, v9.8h\n" + "fmla v23.8h, 
v5.8h, v9.8h\n" + "fmla v22.8h, v4.8h, v9.8h\n" + "fmla v21.8h, v3.8h, v9.8h\n" + "fmla v19.8h, v2.8h, v9.8h\n" + "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n" + "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n" + "ld1 { v9.8h }, [x15]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "ldr q13, [x5, #0x0]\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "ldr q12, [x15, x27]\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "fmla v28.8h, v1.8h, v11.8h\n" + "ld1 { v11.8h }, [x12]\n" + "fmla v26.8h, v8.8h, v10.8h\n" + "fmla v25.8h, v7.8h, v10.8h\n" + "fmla v24.8h, v6.8h, v10.8h\n" + "fmla v22.8h, v5.8h, v10.8h\n" + "fmla v21.8h, v4.8h, v10.8h\n" + "fmla v20.8h, v3.8h, v10.8h\n" + "fmla v18.8h, v2.8h, v10.8h\n" + "fmla v17.8h, v1.8h, v10.8h\n" + "fmla v16.8h, v0.8h, v10.8h\n" + "ldr q10, [x15, x10]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "fmla v27.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v2.8h, v12.8h\n" + "ldr q12, [x15, x9]\n" + "fmla v23.8h, v6.8h, v11.8h\n" + "fmla v19.8h, v3.8h, v11.8h\n" + "ldr q11, [x12, x27]\n" + "fmla v31.8h, v5.8h, v10.8h\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v2.8h, v10.8h\n" + "fmla v26.8h, v1.8h, v10.8h\n" + "fmla v25.8h, v0.8h, v10.8h\n" + "ldr q10, [x14, x6]\n" + "fmla v20.8h, v8.8h, v11.8h\n" + "fmla v16.8h, v5.8h, v11.8h\n" + "ldr q11, [x11, x6]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v26.8h, v2.8h, v12.8h\n" + "fmla v25.8h, v1.8h, v12.8h\n" + "fmla v24.8h, v0.8h, v12.8h\n" + "ldr q12, [x14, x28]\n" + "fmla v19.8h, v7.8h, v11.8h\n" + "fmla v18.8h, v6.8h, v11.8h\n" + "ldr q11, [x11, x28]\n" + "fmla v31.8h, v7.8h, v10.8h\n" + "fmla v30.8h, v6.8h, v10.8h\n" + "fmla v27.8h, v4.8h, v10.8h\n" + "fmla v26.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v22.8h, v0.8h, v10.8h\n" + "ldr q10, [x8, x10]\n" + "fmla v17.8h, v8.8h, v11.8h\n" + "fmla v16.8h, v7.8h, v11.8h\n" + "ldr q11, [x13, x6]\n" + "fmla v29.8h, v8.8h, v12.8h\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v25.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v4.8h, v12.8h\n" + "fmla v21.8h, v2.8h, v12.8h\n" + "fmla v20.8h, v1.8h, v12.8h\n" + "ldr q12, [x8, x9]\n" + "add x8, x8, #0x10\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v0.8h, v10.8h\n" + "ld1 { v10.8h }, [x14]\n" + "fmla v27.8h, v7.8h, v11.8h\n" + "fmla v26.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v4.8h, v11.8h\n" + "fmla v22.8h, v3.8h, v11.8h\n" + "fmla v19.8h, v1.8h, v11.8h\n" + "fmla v18.8h, v0.8h, v11.8h\n" + "ldr q11, [x13, x28]\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "ldr q12, [x14, x27]\n" + "add x14, x14, #0x10\n" + "fmla v31.8h, v6.8h, v10.8h\n" + "ldr q9, [x14, x10]\n" + "fmla v27.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v0.8h, v10.8h\n" + "ld1 { v10.8h }, [x13]\n" + "fmla v25.8h, v8.8h, v11.8h\n" + "fmla v24.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v5.8h, v11.8h\n" + "fmla v20.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v2.8h, v11.8h\n" + "fmla v16.8h, v1.8h, v11.8h\n" + "ldr q11, [x12, x10]\n" + "fmla v28.8h, v8.8h, v12.8h\n" + "fmla v24.8h, v5.8h, v12.8h\n" + "fmla v20.8h, v2.8h, v12.8h\n" + "ldr q12, [x13, x27]\n" + "add x13, x13, #0x10\n" + "fmla v27.8h, v6.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v19.8h, v0.8h, v10.8h\n" + "ldr q10, [x11, x10]\n" + "fmla v22.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v8.8h, v11.8h\n" + "fmla v19.8h, v5.8h, v11.8h\n" + "fmla v18.8h, v4.8h, v11.8h\n" + "fmla 
v17.8h, v3.8h, v11.8h\n" + "ldr q11, [x12, x9]\n" + "fmla v24.8h, v8.8h, v12.8h\n" + "fmla v20.8h, v5.8h, v12.8h\n" + "fmla v16.8h, v2.8h, v12.8h\n" + "ldr q12, [x11, x9]\n" + "add x11, x11, #0x10\n" + "fmla v19.8h, v8.8h, v10.8h\n" + "fmla v18.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v6.8h, v10.8h\n" + "ldr q10, [x15, x6]\n" + "fmla v22.8h, v8.8h, v11.8h\n" + "fmla v21.8h, v7.8h, v11.8h\n" + "fmla v20.8h, v6.8h, v11.8h\n" + "fmla v18.8h, v5.8h, v11.8h\n" + "fmla v17.8h, v4.8h, v11.8h\n" + "fmla v16.8h, v3.8h, v11.8h\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v18.8h, v8.8h, v12.8h\n" + "fmla v31.8h, v4.8h, v10.8h\n" + "fmla v17.8h, v7.8h, v12.8h\n" + "fmla v16.8h, v6.8h, v12.8h\n" + "ldr q12, [x12, x6]\n" + "fmla v30.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v1.8h, v10.8h\n" + "fmla v26.8h, v0.8h, v10.8h\n" + "ldr q10, [x12, x28]\n" + "add x12, x12, #0x10\n" + "fmla v29.8h, v5.8h, v11.8h\n" + "ldr q0, [x5, #0x10]\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v25.8h, v2.8h, v11.8h\n" + "ldr q2, [x5, #0x30]\n" + "fmla v24.8h, v1.8h, v11.8h\n" + "ldr q11, [x8, x27]\n" + "fmla v23.8h, v7.8h, v12.8h\n" + "ldr q1, [x5, #0x20]\n" + "fmla v22.8h, v6.8h, v12.8h\n" + "ldr q6, [x5, #0x70]\n" + "fmla v19.8h, v4.8h, v12.8h\n" + "fmla v18.8h, v3.8h, v12.8h\n" + "ldr q12, [x14, x9]\n" + "fmla v21.8h, v8.8h, v10.8h\n" + "ldr q3, [x5, #0x40]\n" + "fmla v20.8h, v7.8h, v10.8h\n" + "ldr q7, [x5, #0x80]\n" + "fmla v17.8h, v5.8h, v10.8h\n" + "ldr q5, [x5, #0x60]\n" + "fmla v16.8h, v4.8h, v10.8h\n" + "ld1 { v10.8h }, [x8]\n" + "fmax v31.8h, v31.8h, v15.8h\n" + "ldr q4, [x5, #0x50]\n" + "fmax v30.8h, v30.8h, v15.8h\n" + "ldr q8, [x5, #0x90]\n" + "add x5, x5, #0xa0\n" + "fmin v31.8h, v31.8h, v14.8h\n" + "st1 { v31.8h }, [x16]\n" + "fmin v30.8h, v30.8h, v14.8h\n" + "fmax v29.8h, v29.8h, v15.8h\n" + "str q30, [x16, x17]\n" + "fmin v29.8h, v29.8h, v14.8h\n" + "fmax v28.8h, v28.8h, v15.8h\n" + "str q29, [x16, x23]\n" + "fmin v28.8h, v28.8h, v14.8h\n" + "fmax v27.8h, v27.8h, v15.8h\n" + "str q28, [x16, x22]\n" + "fmin v27.8h, v27.8h, v14.8h\n" + "add x16, x16, #0x10\n" + "fmax v26.8h, v26.8h, v15.8h\n" + "st1 { v27.8h }, [x26]\n" + "fmax v25.8h, v25.8h, v15.8h\n" + "fmax v24.8h, v24.8h, v15.8h\n" + "fmin v26.8h, v26.8h, v14.8h\n" + "str q26, [x26, x17]\n" + "fmin v25.8h, v25.8h, v14.8h\n" + "fmin v24.8h, v24.8h, v14.8h\n" + "str q25, [x26, x23]\n" + "fmax v23.8h, v23.8h, v15.8h\n" + "fmax v22.8h, v22.8h, v15.8h\n" + "str q24, [x26, x22]\n" + "add x26, x26, #0x10\n" + "fmax v21.8h, v21.8h, v15.8h\n" + "fmax v20.8h, v20.8h, v15.8h\n" + "fmin v23.8h, v23.8h, v14.8h\n" + "st1 { v23.8h }, [x25]\n" + "fmin v22.8h, v22.8h, v14.8h\n" + "fmin v21.8h, v21.8h, v14.8h\n" + "str q22, [x25, x17]\n" + "fmin v20.8h, v20.8h, v14.8h\n" + "fmax v19.8h, v19.8h, v15.8h\n" + "str q21, [x25, x23]\n" + "fmax v18.8h, v18.8h, v15.8h\n" + "str q20, [x25, x22]\n" + "fmin v19.8h, v19.8h, v14.8h\n" + "add x25, x25, #0x10\n" + "fmin v18.8h, v18.8h, v14.8h\n" + "st1 { v19.8h }, [x24]\n" + "fmax v17.8h, v17.8h, v15.8h\n" + "fmax v16.8h, v16.8h, v15.8h\n" + "str q18, [x24, x17]\n" + "fmin v17.8h, v17.8h, v14.8h\n" + "str q17, [x24, x23]\n" + "fmin v16.8h, v16.8h, v14.8h\n" + "str q16, [x24, x22]\n" + "add x24, x24, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n" + "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n" + 
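// Channel tail: final full 8-lane pass; same multiply-accumulate sequence as the main loop, minus the weight/input preloads for a further iteration. +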
"mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n" + "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n" + "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n" + "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n" + "ldr q9, [x13, x10]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ld1 { v10.8h }, [x11]\n" + "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n" + "ldr q11, [x11, x27]\n" + "fmla v30.8h, v8.8h, v12.8h\n" + "fmla v29.8h, v7.8h, v12.8h\n" + "fmla v26.8h, v5.8h, v12.8h\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n" + "fmla v22.8h, v2.8h, v12.8h\n" + "fmla v21.8h, v1.8h, v12.8h\n" + "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n" + "ldr q12, [x8, x6]\n" + "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n" + "ldr q10, [x13, x9]\n" + "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n" + "ldr q11, [x8, x28]\n" + "fmla v27.8h, v8.8h, v9.8h\n" + "fmla v26.8h, v7.8h, v9.8h\n" + "fmla v25.8h, v6.8h, v9.8h\n" + "fmla v23.8h, v5.8h, v9.8h\n" + "fmla v22.8h, v4.8h, v9.8h\n" + "fmla v21.8h, v3.8h, v9.8h\n" + "fmla v19.8h, v2.8h, v9.8h\n" + "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n" + "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n" + "ld1 { v9.8h }, [x15]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "ldr q12, [x15, x27]\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "fmla v28.8h, v1.8h, v11.8h\n" + "ld1 { v11.8h }, [x12]\n" + "fmla v26.8h, v8.8h, v10.8h\n" + "fmla v25.8h, v7.8h, v10.8h\n" + "fmla v24.8h, v6.8h, v10.8h\n" + "fmla v22.8h, v5.8h, v10.8h\n" + "fmla v21.8h, v4.8h, v10.8h\n" + "fmla v20.8h, v3.8h, v10.8h\n" + "fmla v18.8h, v2.8h, v10.8h\n" + "fmla v17.8h, v1.8h, v10.8h\n" + "fmla v16.8h, v0.8h, v10.8h\n" + "ldr q10, [x15, x10]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "fmla v27.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v2.8h, v12.8h\n" + "ldr q12, [x15, x9]\n" + "fmla v23.8h, v6.8h, v11.8h\n" + "fmla v19.8h, v3.8h, v11.8h\n" + "ldr q11, [x12, x27]\n" + "fmla v31.8h, v5.8h, v10.8h\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v2.8h, v10.8h\n" + "fmla v26.8h, v1.8h, v10.8h\n" + "fmla v25.8h, v0.8h, v10.8h\n" + "ldr q10, [x14, x6]\n" + "fmla v20.8h, v8.8h, v11.8h\n" + "fmla v16.8h, v5.8h, v11.8h\n" + "ldr q11, [x11, x6]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v26.8h, v2.8h, v12.8h\n" + "fmla v25.8h, v1.8h, v12.8h\n" + "fmla v24.8h, v0.8h, v12.8h\n" + "ldr q12, [x14, x28]\n" + "fmla v19.8h, v7.8h, v11.8h\n" + "fmla v18.8h, v6.8h, v11.8h\n" + "ldr q11, [x11, x28]\n" + "fmla v31.8h, v7.8h, v10.8h\n" + "fmla v30.8h, v6.8h, v10.8h\n" + "fmla v27.8h, v4.8h, v10.8h\n" + "fmla v26.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v22.8h, v0.8h, v10.8h\n" + "ldr q10, [x8, x10]\n" + "fmla v17.8h, v8.8h, v11.8h\n" + "fmla v16.8h, v7.8h, v11.8h\n" + "ldr q11, [x13, x6]\n" + "fmla v29.8h, v8.8h, v12.8h\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v25.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v4.8h, v12.8h\n" + "fmla v21.8h, v2.8h, v12.8h\n" + "fmla v20.8h, v1.8h, v12.8h\n" + "ldr q12, [x8, x9]\n" + "add x8, x8, #0x10\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v0.8h, v10.8h\n" + "ld1 { v10.8h }, [x14]\n" + "fmla v27.8h, v7.8h, v11.8h\n" + "fmla v26.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v4.8h, v11.8h\n" + "fmla v22.8h, v3.8h, v11.8h\n" + "fmla v19.8h, v1.8h, v11.8h\n" + "fmla v18.8h, v0.8h, v11.8h\n" + "ldr 
q11, [x13, x28]\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "ldr q12, [x14, x27]\n" + "add x14, x14, #0x10\n" + "fmla v31.8h, v6.8h, v10.8h\n" + "fmla v27.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v0.8h, v10.8h\n" + "ld1 { v10.8h }, [x13]\n" + "fmla v25.8h, v8.8h, v11.8h\n" + "fmla v24.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v5.8h, v11.8h\n" + "fmla v20.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v2.8h, v11.8h\n" + "fmla v16.8h, v1.8h, v11.8h\n" + "ldr q11, [x12, x10]\n" + "fmla v28.8h, v8.8h, v12.8h\n" + "fmla v24.8h, v5.8h, v12.8h\n" + "fmla v20.8h, v2.8h, v12.8h\n" + "ldr q12, [x13, x27]\n" + "add x13, x13, #0x10\n" + "fmla v27.8h, v6.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v19.8h, v0.8h, v10.8h\n" + "ldr q10, [x11, x10]\n" + "fmla v22.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v8.8h, v11.8h\n" + "fmla v19.8h, v5.8h, v11.8h\n" + "fmla v18.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v3.8h, v11.8h\n" + "ldr q11, [x12, x9]\n" + "fmla v24.8h, v8.8h, v12.8h\n" + "fmla v20.8h, v5.8h, v12.8h\n" + "fmla v16.8h, v2.8h, v12.8h\n" + "ldr q12, [x11, x9]\n" + "add x11, x11, #0x10\n" + "fmla v19.8h, v8.8h, v10.8h\n" + "fmla v18.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v6.8h, v10.8h\n" + "ldr q10, [x15, x6]\n" + "fmla v22.8h, v8.8h, v11.8h\n" + "fmla v21.8h, v7.8h, v11.8h\n" + "fmla v20.8h, v6.8h, v11.8h\n" + "fmla v18.8h, v5.8h, v11.8h\n" + "fmla v17.8h, v4.8h, v11.8h\n" + "fmla v16.8h, v3.8h, v11.8h\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v18.8h, v8.8h, v12.8h\n" + "fmla v31.8h, v4.8h, v10.8h\n" + "fmla v17.8h, v7.8h, v12.8h\n" + "fmla v16.8h, v6.8h, v12.8h\n" + "ldr q12, [x12, x6]\n" + "fmla v30.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v1.8h, v10.8h\n" + "fmla v26.8h, v0.8h, v10.8h\n" + "ldr q10, [x12, x28]\n" + "add x12, x12, #0x10\n" + "fmla v29.8h, v5.8h, v11.8h\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v25.8h, v2.8h, v11.8h\n" + "fmla v24.8h, v1.8h, v11.8h\n" + "fmla v23.8h, v7.8h, v12.8h\n" + "fmla v22.8h, v6.8h, v12.8h\n" + "fmla v19.8h, v4.8h, v12.8h\n" + "fmla v18.8h, v3.8h, v12.8h\n" + "fmla v21.8h, v8.8h, v10.8h\n" + "fmla v20.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v5.8h, v10.8h\n" + "fmla v16.8h, v4.8h, v10.8h\n" + "fmax v31.8h, v31.8h, v15.8h\n" + "fmax v30.8h, v30.8h, v15.8h\n" + "fmax v29.8h, v29.8h, v15.8h\n" + "fmin v31.8h, v31.8h, v14.8h\n" + "st1 { v31.8h }, [x16]\n" + "fmin v30.8h, v30.8h, v14.8h\n" + "fmin v29.8h, v29.8h, v14.8h\n" + "str q30, [x16, x17]\n" + "fmax v28.8h, v28.8h, v15.8h\n" + "fmax v27.8h, v27.8h, v15.8h\n" + "str q29, [x16, x23]\n" + "fmax v26.8h, v26.8h, v15.8h\n" + "fmax v25.8h, v25.8h, v15.8h\n" + "fmin v28.8h, v28.8h, v14.8h\n" + "str q28, [x16, x22]\n" + "fmin v27.8h, v27.8h, v14.8h\n" + "add x16, x16, #0x10\n" + "fmin v26.8h, v26.8h, v14.8h\n" + "st1 { v27.8h }, [x26]\n" + "fmin v25.8h, v25.8h, v14.8h\n" + "fmax v24.8h, v24.8h, v15.8h\n" + "str q26, [x26, x17]\n" + "fmax v23.8h, v23.8h, v15.8h\n" + "str q25, [x26, x23]\n" + "fmin v24.8h, v24.8h, v14.8h\n" + "fmax v22.8h, v22.8h, v15.8h\n" + "str q24, [x26, x22]\n" + "fmin v23.8h, v23.8h, v14.8h\n" + "add x26, x26, #0x10\n" + "fmin v22.8h, v22.8h, v14.8h\n" + "st1 { v23.8h }, [x25]\n" + "fmax v21.8h, v21.8h, v15.8h\n" + "fmax v20.8h, v20.8h, v15.8h\n" + "str q22, [x25, x17]\n" + "fmax v19.8h, v19.8h, v15.8h\n" + "fmax v18.8h, v18.8h, v15.8h\n" + "fmin v21.8h, v21.8h, v14.8h\n" + "str q21, [x25, x23]\n" + "fmin v20.8h, v20.8h, v14.8h\n" + "fmin v19.8h, v19.8h, v14.8h\n" + "str q20, [x25, x22]\n" + "fmin v18.8h, 
v18.8h, v14.8h\n" + "add x25, x25, #0x10\n" + "fmax v17.8h, v17.8h, v15.8h\n" + "st1 { v19.8h }, [x24]\n" + "fmax v16.8h, v16.8h, v15.8h\n" + "str q18, [x24, x17]\n" + "fmin v17.8h, v17.8h, v14.8h\n" + "str q17, [x24, x23]\n" + "fmin v16.8h, v16.8h, v14.8h\n" + "str q16, [x24, x22]\n" + "add x24, x24, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x1\n" + "beq 73f\n" + "ldr q13, [x5, #0x0]\n" + "ldr q0, [x5, #0x10]\n" + "add x22, x14, x10\n" + "ldr q1, [x5, #0x20]\n" + "add x21, x8, XZR\n" + "ldr q2, [x5, #0x30]\n" + "add x20, x8, x27\n" + "ldr q3, [x5, #0x40]\n" + "add x19, x14, x9\n" + "ldr q4, [x5, #0x50]\n" + "ldr q5, [x5, #0x60]\n" + "ldr q6, [x5, #0x70]\n" + "ldr q7, [x5, #0x80]\n" + "ldr q8, [x5, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr s9, [x22], #0x4\n" + "ldr s10, [x21], #0x4\n" + "ldr s11, [x20], #0x4\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.h }[2], [x22]\n" + "ld1 { v10.h }[2], [x21]\n" + "ld1 { v11.h }[2], [x20]\n" + "ld1 { v12.h }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset + "ldr h9, [x22, #0x0]\n" + "ldr h10, [x21, #0x0]\n" + "ldr h11, [x20, #0x0]\n" + "ldr h12, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End + "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "add x19, x11, XZR\n" + "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n" + "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n" + "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n" + "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n" + "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n" + "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n" + "fmla v30.8h, v8.8h, v12.8h\n" + "fmla v29.8h, v7.8h, v12.8h\n" + "fmla v26.8h, v5.8h, v12.8h\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n" + "fmla v22.8h, v2.8h, v12.8h\n" + "fmla v21.8h, v1.8h, v12.8h\n" + "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End + "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n" + "add x19, x11, x27\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End + "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n" + "add x19, x13, x10\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v27.8h, v8.8h, v9.8h\n" + "add x19, x8, x6\n" + "fmla v26.8h, v7.8h, v9.8h\n" + "fmla v25.8h, v6.8h, v9.8h\n" + "fmla v23.8h, v5.8h, v9.8h\n" + "fmla v22.8h, v4.8h, v9.8h\n" + "fmla v21.8h, v3.8h, v9.8h\n" + "fmla v19.8h, v2.8h, v9.8h\n" 
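+ // Oddments path: each leftover input element is loaded lane-by-lane (selected by tbz bit-tests on n_channels) and accumulated into every output position it overlaps.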
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n" + "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End + "fmla v31.8h, v1.8h, v12.8h\n" + "add x19, x8, x28\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End + "fmla v29.8h, v2.8h, v11.8h\n" + "add x19, x13, x9\n" + "fmla v28.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v26.8h, v8.8h, v10.8h\n" + "add x19, x15, XZR\n" + "fmla v25.8h, v7.8h, v10.8h\n" + "fmla v24.8h, v6.8h, v10.8h\n" + "fmla v22.8h, v5.8h, v10.8h\n" + "fmla v21.8h, v4.8h, v10.8h\n" + "fmla v20.8h, v3.8h, v10.8h\n" + "fmla v18.8h, v2.8h, v10.8h\n" + "fmla v17.8h, v1.8h, v10.8h\n" + "fmla v16.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End + "fmla v31.8h, v3.8h, v9.8h\n" + "add x19, x15, x27\n" + "fmla v27.8h, v0.8h, v9.8h\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End + "fmla v28.8h, v5.8h, v12.8h\n" + "add x19, x12, XZR\n" + "fmla v24.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End + "fmla v23.8h, v6.8h, v11.8h\n" + "add x19, x15, x10\n" + "fmla v19.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End + "fmla v31.8h, v5.8h, v10.8h\n" + "add x19, x12, x27\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v2.8h, v10.8h\n" + "fmla v26.8h, v1.8h, v10.8h\n" + "fmla v25.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End + "fmla v20.8h, v8.8h, v11.8h\n" + "add x19, x15, x9\n" + "fmla v16.8h, v5.8h, v11.8h\n" + "tbz 
%x[n_channels], #1, 29f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.8h, v5.8h, v12.8h\n" + "add x19, x11, x6\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v26.8h, v2.8h, v12.8h\n" + "fmla v25.8h, v1.8h, v12.8h\n" + "fmla v24.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End + "fmla v19.8h, v7.8h, v11.8h\n" + "add x19, x14, x6\n" + "fmla v18.8h, v6.8h, v11.8h\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v31.8h, v7.8h, v10.8h\n" + "add x19, x11, x28\n" + "fmla v30.8h, v6.8h, v10.8h\n" + "fmla v27.8h, v4.8h, v10.8h\n" + "fmla v26.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v22.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 36f\n" + "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End + "fmla v17.8h, v8.8h, v11.8h\n" + "add x19, x14, x28\n" + "fmla v16.8h, v7.8h, v11.8h\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v29.8h, v8.8h, v12.8h\n" + "add x19, x8, x10\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v25.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v4.8h, v12.8h\n" + "fmla v21.8h, v2.8h, v12.8h\n" + "fmla v20.8h, v1.8h, v12.8h\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End + "fmla v31.8h, v2.8h, v10.8h\n" + "add x19, x13, x6\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 41f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 42f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 42f\n" + "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v27.8h, v7.8h, v11.8h\n" + "add x19, x8, x9\n" + "fmla v26.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v4.8h, v11.8h\n" + "fmla v22.8h, v3.8h, v11.8h\n" + "fmla v19.8h, v1.8h, v11.8h\n" + "fmla v18.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 43f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 44f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 44f\n" + "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End + "fmla v30.8h, v2.8h, 
v12.8h\n" + "add x19, x14, XZR\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 45f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 46f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 46f\n" + "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.8h, v6.8h, v10.8h\n" + "add x19, x13, x28\n" + "fmla v27.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 47f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 48f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 48f\n" + "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v25.8h, v8.8h, v11.8h\n" + "add x19, x14, x27\n" + "fmla v24.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v5.8h, v11.8h\n" + "fmla v20.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v2.8h, v11.8h\n" + "fmla v16.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 49f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 50f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 50f\n" + "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End + "fmla v28.8h, v8.8h, v12.8h\n" + "add x19, x13, XZR\n" + "fmla v24.8h, v5.8h, v12.8h\n" + "fmla v20.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 51f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 52f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 52f\n" + "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v27.8h, v6.8h, v10.8h\n" + "add x19, x12, x10\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v19.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 53f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 54f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 54f\n" + "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End + "fmla v23.8h, v8.8h, v11.8h\n" + "add x19, x13, x27\n" + "fmla v22.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v6.8h, v11.8h\n" + "fmla v19.8h, v5.8h, v11.8h\n" + "fmla v18.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 55f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 56f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 56f\n" + "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End + "fmla v24.8h, v8.8h, v12.8h\n" + "add x19, x11, x10\n" + "fmla v20.8h, v5.8h, v12.8h\n" + "fmla v16.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 57f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 58f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 58f\n" + "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End + "fmla v19.8h, v8.8h, v10.8h\n" + "add x19, x12, x9\n" + "fmla v18.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v6.8h, v10.8h\n" + "tbz %x[n_channels], #1, 59f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 60f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 60f\n" + "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v22.8h, v8.8h, v11.8h\n" + "add x19, 
x11, x9\n" + "fmla v21.8h, v7.8h, v11.8h\n" + "fmla v20.8h, v6.8h, v11.8h\n" + "fmla v18.8h, v5.8h, v11.8h\n" + "fmla v17.8h, v4.8h, v11.8h\n" + "fmla v16.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 61f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 62f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 62f\n" + "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End + "fmla v18.8h, v8.8h, v12.8h\n" + "add x19, x15, x6\n" + "fmla v17.8h, v7.8h, v12.8h\n" + "fmla v16.8h, v6.8h, v12.8h\n" + "tbz %x[n_channels], #1, 63f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 64f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 64f\n" + "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End + "fmla v31.8h, v4.8h, v10.8h\n" + "add x19, x15, x28\n" + "fmla v30.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v1.8h, v10.8h\n" + "fmla v26.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 65f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 66f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 66f\n" + "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End + "fmla v29.8h, v5.8h, v11.8h\n" + "add x19, x12, x6\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v25.8h, v2.8h, v11.8h\n" + "fmla v24.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 67f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 68f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 68f\n" + "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End + "fmla v23.8h, v7.8h, v12.8h\n" + "add x19, x12, x28\n" + "fmla v22.8h, v6.8h, v12.8h\n" + "fmla v19.8h, v4.8h, v12.8h\n" + "fmla v18.8h, v3.8h, v12.8h\n" + "tbz %x[n_channels], #1, 69f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 70f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 70f\n" + "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v21.8h, v8.8h, v10.8h\n" + "fmla v20.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v5.8h, v10.8h\n" + "fmla v16.8h, v4.8h, v10.8h\n" + "fmax v31.8h, v31.8h, v15.8h\n" + "fmax v30.8h, v30.8h, v15.8h\n" + "fmax v29.8h, v29.8h, v15.8h\n" + "fmin v31.8h, v31.8h, v14.8h\n" + "fmin v30.8h, v30.8h, v14.8h\n" + "fmin v29.8h, v29.8h, v14.8h\n" + "fmax v28.8h, v28.8h, v15.8h\n" + "fmax v27.8h, v27.8h, v15.8h\n" + "fmax v26.8h, v26.8h, v15.8h\n" + "fmin v28.8h, v28.8h, v14.8h\n" + "fmin v27.8h, v27.8h, v14.8h\n" + "fmin v26.8h, v26.8h, v14.8h\n" + "fmax v25.8h, v25.8h, v15.8h\n" + "fmax v24.8h, v24.8h, v15.8h\n" + "fmax v23.8h, v23.8h, v15.8h\n" + "fmin v25.8h, v25.8h, v14.8h\n" + "fmin v24.8h, v24.8h, v14.8h\n" + "fmin v23.8h, v23.8h, v14.8h\n" + "fmax v22.8h, v22.8h, v15.8h\n" + "fmax v21.8h, v21.8h, v15.8h\n" + "fmax v20.8h, v20.8h, v15.8h\n" + "fmin v22.8h, v22.8h, v14.8h\n" + "fmin v21.8h, v21.8h, v14.8h\n" + "fmin v20.8h, v20.8h, v14.8h\n" + "fmax v19.8h, v19.8h, v15.8h\n" + "fmax v18.8h, v18.8h, v15.8h\n" + "fmax v17.8h, v17.8h, v15.8h\n" + "fmin v19.8h, v19.8h, v14.8h\n" + "fmin v18.8h, v18.8h, v14.8h\n" + "fmin v17.8h, v17.8h, v14.8h\n" + "fmax v16.8h, v16.8h, v15.8h\n" + "fmin v16.8h, v16.8h, v14.8h\n" + "tbz %x[n_channels], #1, 71f\n" + "mov x19, x16\n" + "st1 { v31.s }[0], [x19], x17\n" + "add x16, 
x16, #0x4\n" + "st1 { v30.s }[0], [x19], x17\n" + "mov x21, x26\n" + "st1 { v29.s }[0], [x19], x17\n" + "st1 { v27.s }[0], [x21], x17\n" + "add x26, x26, #0x4\n" + "st1 { v28.s }[0], [x19]\n" + "mov x20, x25\n" + "st1 { v26.s }[0], [x21], x17\n" + "add x25, x25, #0x4\n" + "st1 { v25.s }[0], [x21], x17\n" + "mov x19, x24\n" + "st1 { v24.s }[0], [x21]\n" + "add x24, x24, #0x4\n" + "st1 { v23.s }[0], [x20], x17\n" + "st1 { v22.s }[0], [x20], x17\n" + "st1 { v21.s }[0], [x20], x17\n" + "st1 { v20.s }[0], [x20]\n" + "st1 { v19.s }[0], [x19], x17\n" + "st1 { v18.s }[0], [x19], x17\n" + "st1 { v17.s }[0], [x19], x17\n" + "st1 { v16.s }[0], [x19]\n" + "tbz %x[n_channels], #0, 72f\n" + "mov x22, x16\n" + "st1 { v31.h }[2], [x22], x17\n" + "mov x21, x26\n" + "st1 { v30.h }[2], [x22], x17\n" + "st1 { v27.h }[2], [x21], x17\n" + "mov x20, x25\n" + "st1 { v29.h }[2], [x22], x17\n" + "mov x19, x24\n" + "st1 { v28.h }[2], [x22]\n" + "st1 { v26.h }[2], [x21], x17\n" + "st1 { v25.h }[2], [x21], x17\n" + "st1 { v24.h }[2], [x21]\n" + "st1 { v23.h }[2], [x20], x17\n" + "st1 { v22.h }[2], [x20], x17\n" + "st1 { v21.h }[2], [x20], x17\n" + "st1 { v20.h }[2], [x20]\n" + "st1 { v19.h }[2], [x19], x17\n" + "st1 { v18.h }[2], [x19], x17\n" + "st1 { v17.h }[2], [x19], x17\n" + "st1 { v16.h }[2], [x19]\n" + "b 72f\n" + "71:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x22, x16\n" + "st1 { v31.h }[0], [x22], x17\n" + "mov x21, x26\n" + "mov x20, x25\n" + "st1 { v30.h }[0], [x22], x17\n" + "st1 { v27.h }[0], [x21], x17\n" + "mov x19, x24\n" + "st1 { v29.h }[0], [x22], x17\n" + "st1 { v28.h }[0], [x22]\n" + "st1 { v26.h }[0], [x21], x17\n" + "st1 { v25.h }[0], [x21], x17\n" + "st1 { v24.h }[0], [x21]\n" + "st1 { v23.h }[0], [x20], x17\n" + "st1 { v22.h }[0], [x20], x17\n" + "st1 { v21.h }[0], [x20], x17\n" + "st1 { v20.h }[0], [x20]\n" + "st1 { v19.h }[0], [x19], x17\n" + "st1 { v18.h }[0], [x19], x17\n" + "st1 { v17.h }[0], [x19], x17\n" + "st1 { v16.h }[0], [x19]\n" + "72:" // Tile loop: Oddments: Store: Bit 1: End + + "73:" // Tile loop: End + "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x4, #0x1\n" + "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x26, x26, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x26, x19\n" + "csel x26, x26, XZR, LT\n" + "csel x4, x4, x21, LT\n" + "cmp x4, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7",
"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..40c019a36c --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,1399 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + __fp16 *const *outptrs; + const void *params; + const __fp16 min, max; + const __fp16 *inptrs[36]; + + Args( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *const params, + const __fp16 min, + const __fp16 max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[14]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[5]; + inptrs[3] = input_ptrs[15]; + inptrs[4] = input_ptrs[30]; + inptrs[5] = input_ptrs[35]; + inptrs[6] = input_ptrs[20]; + inptrs[7] = input_ptrs[1]; + inptrs[8] = input_ptrs[4]; + inptrs[9] = input_ptrs[21]; + inptrs[10] = input_ptrs[6]; + inptrs[11] = input_ptrs[11]; + inptrs[12] = input_ptrs[24]; + inptrs[13] = input_ptrs[8]; + inptrs[14] = input_ptrs[29]; + inptrs[15] = input_ptrs[9]; + inptrs[16] = input_ptrs[31]; + inptrs[17] = input_ptrs[13]; + inptrs[18] = input_ptrs[34]; + inptrs[19] = input_ptrs[16]; + inptrs[20] = input_ptrs[2]; + inptrs[21] = input_ptrs[19]; + inptrs[22] = input_ptrs[3]; + inptrs[23] = input_ptrs[12]; + inptrs[24] = input_ptrs[22]; + inptrs[25] = input_ptrs[17]; + inptrs[26] = input_ptrs[18]; + inptrs[27] = input_ptrs[26]; + inptrs[28] = input_ptrs[23]; + inptrs[29] = input_ptrs[32]; + inptrs[30] = input_ptrs[27]; + inptrs[31] = input_ptrs[33]; + inptrs[32] = input_ptrs[7]; + inptrs[33] = input_ptrs[10]; + inptrs[34] = input_ptrs[25]; + inptrs[35] = input_ptrs[28]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x20, %x[params_struct], %[offsetof_args_min]\n" + "add x19, %x[params_struct], %[offsetof_args_max]\n" + "ld1r { v15.8h }, [x20]\n" + "ld1r { v14.8h }, [x19]\n" + "mov x14, #0x0\n" + "mov x13, #0x10\n" // cntb _, ALL, #1 + "sub x12, XZR, x13\n" + "lsr x11, %x[n_channels], #0x3\n" + "cbz x11, 3f\n" + "ldr q13, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "cmp x13, x11, LSL #4\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "ldp x10, x9, [x16, #0x0]\n" + "ldp x28, x27, [x16, #0x10]\n" + "ldr q9, [x10, x14]\n" + "ldr q10, [x9, x14]\n" + "ldr q11, [x28, x14]\n" + "ldr q12, [x27, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x26, [x16, #0x20]\n" + "add x12, x12, #0x10\n" + "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "ldr x25, [x16, #0x28]\n" + "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "ldr x24, [x16, #0x30]\n" + "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n" + "ldr x23, [x16, #0x38]\n" + "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n" + "ldr x10, [x16, #0x40]\n" + "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n" + "ldr x9, [x16, #0x48]\n" + "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n" +
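// Indirect variant: input addresses are read from the 36-entry inptrs table (6x6 patch, base register x16) set up by Args above, rather than computed from row/column strides. +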
"ldr x28, [x16, #0x50]\n" + "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n" + "ldr x27, [x16, #0x58]\n" + "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n" + "ldr q9, [x24, x14]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr q10, [x26, x14]\n" + "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n" + "ldr q11, [x25, x14]\n" + "fmla v30.8h, v8.8h, v12.8h\n" + "ldr x26, [x16, #0x60]\n" + "fmla v29.8h, v7.8h, v12.8h\n" + "ldr x25, [x16, #0x68]\n" + "fmla v26.8h, v5.8h, v12.8h\n" + "ldr x24, [x16, #0x70]\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "ldr x22, [x17, #0x0]\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "ldr x21, [x17, #0x8]\n" + "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n" + "ldr x20, [x17, #0x10]\n" + "fmla v22.8h, v2.8h, v12.8h\n" + "ldr x19, [x17, #0x18]\n" + "fmla v21.8h, v1.8h, v12.8h\n" + "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n" + "ldr q12, [x23, x14]\n" + "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n" + "ldr q10, [x9, x14]\n" + "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n" + "ldr q11, [x10, x14]\n" + "fmla v27.8h, v8.8h, v9.8h\n" + "ldr x23, [x16, #0x78]\n" + "fmla v26.8h, v7.8h, v9.8h\n" + "ldr x10, [x16, #0x80]\n" + "fmla v25.8h, v6.8h, v9.8h\n" + "ldr x9, [x16, #0x88]\n" + "fmla v23.8h, v5.8h, v9.8h\n" + "fmla v22.8h, v4.8h, v9.8h\n" + "fmla v21.8h, v3.8h, v9.8h\n" + "fmla v19.8h, v2.8h, v9.8h\n" + "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n" + "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n" + "ldr q9, [x28, x14]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "ldr q12, [x27, x14]\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "ldr x27, [x16, #0x98]\n" + "fmla v28.8h, v1.8h, v11.8h\n" + "ldr q11, [x26, x14]\n" + "fmla v26.8h, v8.8h, v10.8h\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v25.8h, v7.8h, v10.8h\n" + "ldr q13, [x15, #0x0]\n" + "fmla v24.8h, v6.8h, v10.8h\n" + "fmla v22.8h, v5.8h, v10.8h\n" + "fmla v21.8h, v4.8h, v10.8h\n" + "fmla v20.8h, v3.8h, v10.8h\n" + "fmla v18.8h, v2.8h, v10.8h\n" + "fmla v17.8h, v1.8h, v10.8h\n" + "fmla v16.8h, v0.8h, v10.8h\n" + "ldr q10, [x25, x14]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v27.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v2.8h, v12.8h\n" + "ldr q12, [x23, x14]\n" + "fmla v23.8h, v6.8h, v11.8h\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v19.8h, v3.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "fmla v31.8h, v5.8h, v10.8h\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v2.8h, v10.8h\n" + "fmla v26.8h, v1.8h, v10.8h\n" + "fmla v25.8h, v0.8h, v10.8h\n" + "ldr q10, [x9, x14]\n" + "fmla v20.8h, v8.8h, v11.8h\n" + "ldr x9, [x16, #0xc8]\n" + "fmla v16.8h, v5.8h, v11.8h\n" + "ldr q11, [x10, x14]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v26.8h, v2.8h, v12.8h\n" + "fmla v25.8h, v1.8h, v12.8h\n" + "fmla v24.8h, v0.8h, v12.8h\n" + "ldr q12, [x27, x14]\n" + "fmla v19.8h, v7.8h, v11.8h\n" + "ldr x27, [x16, #0xd8]\n" + "fmla v18.8h, v6.8h, v11.8h\n" + "ldr q11, [x28, x14]\n" + "fmla v31.8h, v7.8h, v10.8h\n" + "ldr x28, [x16, #0xd0]\n" + "fmla v30.8h, v6.8h, v10.8h\n" + "fmla v27.8h, v4.8h, v10.8h\n" + "fmla v26.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v22.8h, v0.8h, v10.8h\n" + "ldr q10, [x26, x14]\n" + "fmla v17.8h, v8.8h, v11.8h\n" + "ldr x26, [x16, #0xe0]\n" + "fmla v16.8h, v7.8h, v11.8h\n" + "ldr q11, [x25, x14]\n" + "fmla v29.8h, 
v8.8h, v12.8h\n" + "ldr x25, [x16, #0xe8]\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v25.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v4.8h, v12.8h\n" + "fmla v21.8h, v2.8h, v12.8h\n" + "fmla v20.8h, v1.8h, v12.8h\n" + "ldr q12, [x24, x14]\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr x24, [x16, #0xf0]\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v0.8h, v10.8h\n" + "ldr q10, [x23, x14]\n" + "fmla v27.8h, v7.8h, v11.8h\n" + "ldr x23, [x16, #0xf8]\n" + "fmla v26.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v4.8h, v11.8h\n" + "fmla v22.8h, v3.8h, v11.8h\n" + "fmla v19.8h, v1.8h, v11.8h\n" + "fmla v18.8h, v0.8h, v11.8h\n" + "ldr q11, [x10, x14]\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "ldr x10, [x16, #0x100]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "ldr q12, [x9, x14]\n" + "fmla v31.8h, v6.8h, v10.8h\n" + "ldr x9, [x16, #0x108]\n" + "fmla v27.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v0.8h, v10.8h\n" + "ldr q10, [x28, x14]\n" + "fmla v25.8h, v8.8h, v11.8h\n" + "ldr x28, [x16, #0x110]\n" + "fmla v24.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v5.8h, v11.8h\n" + "fmla v20.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v2.8h, v11.8h\n" + "fmla v16.8h, v1.8h, v11.8h\n" + "ldr q11, [x27, x14]\n" + "fmla v28.8h, v8.8h, v12.8h\n" + "ldr x27, [x16, #0x118]\n" + "fmla v24.8h, v5.8h, v12.8h\n" + "fmla v20.8h, v2.8h, v12.8h\n" + "ldr q12, [x26, x14]\n" + "fmla v27.8h, v6.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v19.8h, v0.8h, v10.8h\n" + "ldr q10, [x25, x14]\n" + "fmla v22.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v8.8h, v11.8h\n" + "fmla v19.8h, v5.8h, v11.8h\n" + "fmla v18.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v3.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "fmla v24.8h, v8.8h, v12.8h\n" + "fmla v20.8h, v5.8h, v12.8h\n" + "fmla v16.8h, v2.8h, v12.8h\n" + "ldr q12, [x23, x14]\n" + "fmla v19.8h, v8.8h, v10.8h\n" + "fmla v18.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v6.8h, v10.8h\n" + "ldr q10, [x10, x14]\n" + "fmla v22.8h, v8.8h, v11.8h\n" + "fmla v21.8h, v7.8h, v11.8h\n" + "fmla v20.8h, v6.8h, v11.8h\n" + "fmla v18.8h, v5.8h, v11.8h\n" + "fmla v17.8h, v4.8h, v11.8h\n" + "fmla v16.8h, v3.8h, v11.8h\n" + "ldr q11, [x9, x14]\n" + "fmla v31.8h, v4.8h, v10.8h\n" + "ldp x10, x9, [x16, #0x0]\n" + "fmla v18.8h, v8.8h, v12.8h\n" + "ldr q9, [x10, x13]\n" + "fmla v17.8h, v7.8h, v12.8h\n" + "fmla v16.8h, v6.8h, v12.8h\n" + "ldr q12, [x28, x14]\n" + "fmla v30.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v1.8h, v10.8h\n" + "fmla v26.8h, v0.8h, v10.8h\n" + "ldr q10, [x27, x14]\n" + "add x14, x14, #0x10\n" + "fmla v29.8h, v5.8h, v11.8h\n" + "ldp x28, x27, [x16, #0x10]\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "ldr q0, [x15, #0x10]\n" + "fmla v25.8h, v2.8h, v11.8h\n" + "ldr q2, [x15, #0x30]\n" + "fmla v24.8h, v1.8h, v11.8h\n" + "ldr q11, [x28, x13]\n" + "fmla v23.8h, v7.8h, v12.8h\n" + "ldr q1, [x15, #0x20]\n" + "fmla v22.8h, v6.8h, v12.8h\n" + "ldr q6, [x15, #0x70]\n" + "fmla v19.8h, v4.8h, v12.8h\n" + "fmla v18.8h, v3.8h, v12.8h\n" + "ldr q12, [x27, x13]\n" + "fmla v21.8h, v8.8h, v10.8h\n" + "ldr q3, [x15, #0x40]\n" + "fmla v20.8h, v7.8h, v10.8h\n" + "ldr q7, [x15, #0x80]\n" + "fmla v17.8h, v5.8h, v10.8h\n" + "ldr q5, [x15, #0x60]\n" + "fmla v16.8h, v4.8h, v10.8h\n" + "ldr q10, [x9, x13]\n" + "add x13, x13, #0x10\n" + "fmax v31.8h, v31.8h, v15.8h\n" + "ldr q4, [x15, #0x50]\n" + "cmp x13, x11, LSL #4\n" + "fmax v30.8h, v30.8h, v15.8h\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "fmax v29.8h, v29.8h, v15.8h\n" + "fmax v28.8h, v28.8h, v15.8h\n" + "fmin v31.8h, v31.8h, v14.8h\n" + "str q31, 
[x22, x12]\n" + "fmin v30.8h, v30.8h, v14.8h\n" + "fmin v29.8h, v29.8h, v14.8h\n" + "ldr x22, [x17, #0x20]\n" + "fmin v28.8h, v28.8h, v14.8h\n" + "str q30, [x21, x12]\n" + "fmax v27.8h, v27.8h, v15.8h\n" + "fmax v26.8h, v26.8h, v15.8h\n" + "str q29, [x20, x12]\n" + "fmax v25.8h, v25.8h, v15.8h\n" + "str q28, [x19, x12]\n" + "fmax v24.8h, v24.8h, v15.8h\n" + "ldr x21, [x17, #0x28]\n" + "fmin v27.8h, v27.8h, v14.8h\n" + "ldr x20, [x17, #0x30]\n" + "fmin v26.8h, v26.8h, v14.8h\n" + "ldr x19, [x17, #0x38]\n" + "fmin v25.8h, v25.8h, v14.8h\n" + "str q27, [x22, x12]\n" + "fmin v24.8h, v24.8h, v14.8h\n" + "str q26, [x21, x12]\n" + "fmax v23.8h, v23.8h, v15.8h\n" + "str q25, [x20, x12]\n" + "fmax v22.8h, v22.8h, v15.8h\n" + "str q24, [x19, x12]\n" + "fmax v21.8h, v21.8h, v15.8h\n" + "ldr x22, [x17, #0x40]\n" + "fmin v23.8h, v23.8h, v14.8h\n" + "ldr x21, [x17, #0x48]\n" + "fmin v22.8h, v22.8h, v14.8h\n" + "ldr x20, [x17, #0x50]\n" + "fmin v21.8h, v21.8h, v14.8h\n" + "str q23, [x22, x12]\n" + "fmax v20.8h, v20.8h, v15.8h\n" + "str q22, [x21, x12]\n" + "fmax v19.8h, v19.8h, v15.8h\n" + "str q21, [x20, x12]\n" + "fmax v18.8h, v18.8h, v15.8h\n" + "ldr x19, [x17, #0x58]\n" + "fmin v20.8h, v20.8h, v14.8h\n" + "ldr x22, [x17, #0x60]\n" + "fmin v19.8h, v19.8h, v14.8h\n" + "ldr x21, [x17, #0x68]\n" + "fmin v18.8h, v18.8h, v14.8h\n" + "str q20, [x19, x12]\n" + "fmax v17.8h, v17.8h, v15.8h\n" + "str q19, [x22, x12]\n" + "fmax v16.8h, v16.8h, v15.8h\n" + "str q18, [x21, x12]\n" + "ldr x20, [x17, #0x70]\n" + "fmin v17.8h, v17.8h, v14.8h\n" + "ldr x19, [x17, #0x78]\n" + "fmin v16.8h, v16.8h, v14.8h\n" + "str q17, [x20, x12]\n" + "str q16, [x19, x12]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x26, [x16, #0x20]\n" + "add x12, x12, #0x10\n" + "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "ldr x25, [x16, #0x28]\n" + "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "ldr x24, [x16, #0x30]\n" + "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n" + "ldr x23, [x16, #0x38]\n" + "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n" + "ldr x10, [x16, #0x40]\n" + "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n" + "ldr x9, [x16, #0x48]\n" + "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n" + "ldr x28, [x16, #0x50]\n" + "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n" + "ldr x27, [x16, #0x58]\n" + "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n" + "ldr q9, [x24, x14]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr q10, [x26, x14]\n" + "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n" + "ldr q11, [x25, x14]\n" + "fmla v30.8h, v8.8h, v12.8h\n" + "ldr x26, [x16, #0x60]\n" + "fmla v29.8h, v7.8h, v12.8h\n" + "ldr x25, [x16, #0x68]\n" + "fmla v26.8h, v5.8h, v12.8h\n" + "ldr x24, [x16, #0x70]\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "ldr x22, [x17, #0x0]\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "ldr x21, [x17, #0x8]\n" + "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n" + "ldr x20, [x17, #0x10]\n" + "fmla v22.8h, v2.8h, v12.8h\n" + "ldr x19, [x17, #0x18]\n" + "fmla v21.8h, v1.8h, v12.8h\n" + "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n" + "ldr q12, [x23, x14]\n" + "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n" + "ldr q10, [x9, x14]\n" + "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n" + "ldr q11, [x10, x14]\n" + "fmla v27.8h, v8.8h, v9.8h\n" + "ldr x23, [x16, #0x78]\n" + "fmla v26.8h, v7.8h, v9.8h\n" + "ldr x10, [x16, #0x80]\n" + "fmla v25.8h, v6.8h, v9.8h\n" + "ldr x9, [x16, #0x88]\n" + "fmla v23.8h, v5.8h, v9.8h\n" + "fmla 
v22.8h, v4.8h, v9.8h\n" + "fmla v21.8h, v3.8h, v9.8h\n" + "fmla v19.8h, v2.8h, v9.8h\n" + "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n" + "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n" + "ldr q9, [x28, x14]\n" + "fmla v31.8h, v1.8h, v12.8h\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "ldr q12, [x27, x14]\n" + "fmla v29.8h, v2.8h, v11.8h\n" + "ldr x27, [x16, #0x98]\n" + "fmla v28.8h, v1.8h, v11.8h\n" + "ldr q11, [x26, x14]\n" + "fmla v26.8h, v8.8h, v10.8h\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v25.8h, v7.8h, v10.8h\n" + "fmla v24.8h, v6.8h, v10.8h\n" + "fmla v22.8h, v5.8h, v10.8h\n" + "fmla v21.8h, v4.8h, v10.8h\n" + "fmla v20.8h, v3.8h, v10.8h\n" + "fmla v18.8h, v2.8h, v10.8h\n" + "fmla v17.8h, v1.8h, v10.8h\n" + "fmla v16.8h, v0.8h, v10.8h\n" + "ldr q10, [x25, x14]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v27.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v2.8h, v12.8h\n" + "ldr q12, [x23, x14]\n" + "fmla v23.8h, v6.8h, v11.8h\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v19.8h, v3.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "fmla v31.8h, v5.8h, v10.8h\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v2.8h, v10.8h\n" + "fmla v26.8h, v1.8h, v10.8h\n" + "fmla v25.8h, v0.8h, v10.8h\n" + "ldr q10, [x9, x14]\n" + "fmla v20.8h, v8.8h, v11.8h\n" + "ldr x9, [x16, #0xc8]\n" + "fmla v16.8h, v5.8h, v11.8h\n" + "ldr q11, [x10, x14]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v26.8h, v2.8h, v12.8h\n" + "fmla v25.8h, v1.8h, v12.8h\n" + "fmla v24.8h, v0.8h, v12.8h\n" + "ldr q12, [x27, x14]\n" + "fmla v19.8h, v7.8h, v11.8h\n" + "ldr x27, [x16, #0xd8]\n" + "fmla v18.8h, v6.8h, v11.8h\n" + "ldr q11, [x28, x14]\n" + "fmla v31.8h, v7.8h, v10.8h\n" + "ldr x28, [x16, #0xd0]\n" + "fmla v30.8h, v6.8h, v10.8h\n" + "fmla v27.8h, v4.8h, v10.8h\n" + "fmla v26.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v22.8h, v0.8h, v10.8h\n" + "ldr q10, [x26, x14]\n" + "fmla v17.8h, v8.8h, v11.8h\n" + "ldr x26, [x16, #0xe0]\n" + "fmla v16.8h, v7.8h, v11.8h\n" + "ldr q11, [x25, x14]\n" + "fmla v29.8h, v8.8h, v12.8h\n" + "ldr x25, [x16, #0xe8]\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "fmla v25.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v4.8h, v12.8h\n" + "fmla v21.8h, v2.8h, v12.8h\n" + "fmla v20.8h, v1.8h, v12.8h\n" + "ldr q12, [x24, x14]\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr x24, [x16, #0xf0]\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v0.8h, v10.8h\n" + "ldr q10, [x23, x14]\n" + "fmla v27.8h, v7.8h, v11.8h\n" + "ldr x23, [x16, #0xf8]\n" + "fmla v26.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v4.8h, v11.8h\n" + "fmla v22.8h, v3.8h, v11.8h\n" + "fmla v19.8h, v1.8h, v11.8h\n" + "fmla v18.8h, v0.8h, v11.8h\n" + "ldr q11, [x10, x14]\n" + "fmla v30.8h, v2.8h, v12.8h\n" + "ldr x10, [x16, #0x100]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "ldr q12, [x9, x14]\n" + "fmla v31.8h, v6.8h, v10.8h\n" + "ldr x9, [x16, #0x108]\n" + "fmla v27.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v0.8h, v10.8h\n" + "ldr q10, [x28, x14]\n" + "fmla v25.8h, v8.8h, v11.8h\n" + "ldr x28, [x16, #0x110]\n" + "fmla v24.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v5.8h, v11.8h\n" + "fmla v20.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v2.8h, v11.8h\n" + "fmla v16.8h, v1.8h, v11.8h\n" + "ldr q11, [x27, x14]\n" + "fmla v28.8h, v8.8h, v12.8h\n" + "ldr x27, [x16, #0x118]\n" + "fmla v24.8h, v5.8h, v12.8h\n" + "fmla 
v20.8h, v2.8h, v12.8h\n" + "ldr q12, [x26, x14]\n" + "fmla v27.8h, v6.8h, v10.8h\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "fmla v19.8h, v0.8h, v10.8h\n" + "ldr q10, [x25, x14]\n" + "fmla v22.8h, v7.8h, v11.8h\n" + "fmla v21.8h, v6.8h, v11.8h\n" + "fmla v23.8h, v8.8h, v11.8h\n" + "fmla v19.8h, v5.8h, v11.8h\n" + "fmla v18.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v3.8h, v11.8h\n" + "ldr q11, [x24, x14]\n" + "fmla v24.8h, v8.8h, v12.8h\n" + "fmla v20.8h, v5.8h, v12.8h\n" + "fmla v16.8h, v2.8h, v12.8h\n" + "ldr q12, [x23, x14]\n" + "fmla v19.8h, v8.8h, v10.8h\n" + "fmla v18.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v6.8h, v10.8h\n" + "ldr q10, [x10, x14]\n" + "fmla v22.8h, v8.8h, v11.8h\n" + "fmla v21.8h, v7.8h, v11.8h\n" + "fmla v20.8h, v6.8h, v11.8h\n" + "fmla v18.8h, v5.8h, v11.8h\n" + "fmla v17.8h, v4.8h, v11.8h\n" + "fmla v16.8h, v3.8h, v11.8h\n" + "ldr q11, [x9, x14]\n" + "fmla v31.8h, v4.8h, v10.8h\n" + "fmla v18.8h, v8.8h, v12.8h\n" + "fmla v17.8h, v7.8h, v12.8h\n" + "fmla v16.8h, v6.8h, v12.8h\n" + "ldr q12, [x28, x14]\n" + "fmla v30.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v1.8h, v10.8h\n" + "fmla v26.8h, v0.8h, v10.8h\n" + "ldr q10, [x27, x14]\n" + "add x14, x14, #0x10\n" + "fmla v29.8h, v5.8h, v11.8h\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "fmla v25.8h, v2.8h, v11.8h\n" + "fmla v24.8h, v1.8h, v11.8h\n" + "fmla v23.8h, v7.8h, v12.8h\n" + "fmla v22.8h, v6.8h, v12.8h\n" + "fmla v19.8h, v4.8h, v12.8h\n" + "fmla v18.8h, v3.8h, v12.8h\n" + "fmla v21.8h, v8.8h, v10.8h\n" + "fmla v20.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v5.8h, v10.8h\n" + "fmla v16.8h, v4.8h, v10.8h\n" + "fmax v31.8h, v31.8h, v15.8h\n" + "fmax v30.8h, v30.8h, v15.8h\n" + "fmax v29.8h, v29.8h, v15.8h\n" + "fmin v31.8h, v31.8h, v14.8h\n" + "str q31, [x22, x12]\n" + "fmin v30.8h, v30.8h, v14.8h\n" + "fmin v29.8h, v29.8h, v14.8h\n" + "ldr x22, [x17, #0x20]\n" + "fmax v28.8h, v28.8h, v15.8h\n" + "str q30, [x21, x12]\n" + "fmax v27.8h, v27.8h, v15.8h\n" + "fmax v26.8h, v26.8h, v15.8h\n" + "str q29, [x20, x12]\n" + "fmin v28.8h, v28.8h, v14.8h\n" + "ldr x21, [x17, #0x28]\n" + "fmax v25.8h, v25.8h, v15.8h\n" + "ldr x20, [x17, #0x30]\n" + "fmin v27.8h, v27.8h, v14.8h\n" + "str q28, [x19, x12]\n" + "fmin v26.8h, v26.8h, v14.8h\n" + "ldr x19, [x17, #0x38]\n" + "fmin v25.8h, v25.8h, v14.8h\n" + "str q27, [x22, x12]\n" + "fmax v24.8h, v24.8h, v15.8h\n" + "str q26, [x21, x12]\n" + "fmax v23.8h, v23.8h, v15.8h\n" + "str q25, [x20, x12]\n" + "fmax v22.8h, v22.8h, v15.8h\n" + "ldr x22, [x17, #0x40]\n" + "fmin v24.8h, v24.8h, v14.8h\n" + "ldr x21, [x17, #0x48]\n" + "fmin v23.8h, v23.8h, v14.8h\n" + "ldr x20, [x17, #0x50]\n" + "fmin v22.8h, v22.8h, v14.8h\n" + "str q24, [x19, x12]\n" + "fmax v21.8h, v21.8h, v15.8h\n" + "str q23, [x22, x12]\n" + "fmax v20.8h, v20.8h, v15.8h\n" + "str q22, [x21, x12]\n" + "fmax v19.8h, v19.8h, v15.8h\n" + "ldr x19, [x17, #0x58]\n" + "fmin v21.8h, v21.8h, v14.8h\n" + "ldr x22, [x17, #0x60]\n" + "fmin v20.8h, v20.8h, v14.8h\n" + "ldr x21, [x17, #0x68]\n" + "fmin v19.8h, v19.8h, v14.8h\n" + "str q21, [x20, x12]\n" + "fmax v18.8h, v18.8h, v15.8h\n" + "str q20, [x19, x12]\n" + "fmax v17.8h, v17.8h, v15.8h\n" + "str q19, [x22, x12]\n" + "fmax v16.8h, v16.8h, v15.8h\n" + "ldr x20, [x17, #0x70]\n" + "fmin v18.8h, v18.8h, v14.8h\n" + "ldr x19, [x17, #0x78]\n" + "fmin v17.8h, v17.8h, v14.8h\n" + "str q18, [x21, x12]\n" + "fmin v16.8h, v16.8h, v14.8h\n" + "str q17, [x20, x12]\n" + "str q16, [x19, x12]\n" + "3:" // Oddments + "tst %x[n_channels], #0x1\n" + "beq 72f\n" + "ldr q13, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x12, x14\n" 
+ "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x10, [x16, #0x0]\n" + "add x10, x10, x14\n" + "ldr x9, [x16, #0x8]\n" + "ldr x28, [x16, #0x10]\n" + "add x9, x9, x14\n" + "ldr x27, [x16, #0x18]\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.s }[0], [x10], #0x4\n" + "ld1 { v10.s }[0], [x9], #0x4\n" + "ld1 { v11.s }[0], [x28], #0x4\n" + "ld1 { v12.s }[0], [x27], #0x4\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.h }[2], [x10], #0x2\n" + "ld1 { v10.h }[2], [x9], #0x2\n" + "ld1 { v11.h }[2], [x28], #0x2\n" + "ld1 { v12.h }[2], [x27], #0x2\n" + "b 5f\n" + "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset + "ld1 { v9.h }[0], [x10], #0x2\n" + "ld1 { v10.h }[0], [x9], #0x2\n" + "ld1 { v11.h }[0], [x28], #0x2\n" + "ld1 { v12.h }[0], [x27], #0x2\n" + "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End + "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x26, [x16, #0x20]\n" + "add x26, x26, x14\n" + "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n" + "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n" + "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n" + "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n" + "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n" + "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n" + "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n" + "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n" + "fmla v30.8h, v8.8h, v12.8h\n" + "fmla v29.8h, v7.8h, v12.8h\n" + "fmla v26.8h, v5.8h, v12.8h\n" + "fmla v28.8h, v6.8h, v12.8h\n" + "fmla v25.8h, v4.8h, v12.8h\n" + "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n" + "fmla v22.8h, v2.8h, v12.8h\n" + "fmla v21.8h, v1.8h, v12.8h\n" + "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v10.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v10.h }[2], [x26], #0x2\n" + "b 7f\n" + "6:" // Oddments: Load input (5, 0): Bit 1: Unset + "ld1 { v10.h }[0], [x26], #0x2\n" + "7:" // Oddments: Load input (5, 0): Bit 1: End + "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n" + "ldr x25, [x16, #0x28]\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v11.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v11.h }[2], [x25], #0x2\n" + "b 9f\n" + "8:" // Oddments: Load input (5, 5): Bit 1: Unset + "ld1 { v11.h }[0], [x25], #0x2\n" + "9:" // Oddments: Load input (5, 5): Bit 1: End + "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n" + "ldr x24, [x16, #0x30]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v9.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v9.h }[2], [x24], #0x2\n" + "b 11f\n" + "10:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v9.h }[0], [x24], #0x2\n" + "11:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v27.8h, v8.8h, v9.8h\n" + "ldr x23, [x16, #0x38]\n" + "fmla v26.8h, v7.8h, v9.8h\n" + "add x23, x23, x14\n" + "fmla v25.8h, v6.8h, v9.8h\n" + "fmla v23.8h, v5.8h, v9.8h\n" + "fmla v22.8h, v4.8h, v9.8h\n" + "fmla v21.8h, v3.8h, v9.8h\n" + "fmla v19.8h, v2.8h, v9.8h\n" + "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n" + "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n" + "tbz %x[n_channels], #1, 12f\n" + 
"ld1 { v12.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v12.h }[2], [x23], #0x2\n" + "b 13f\n" + "12:" // Oddments: Load input (0, 1): Bit 1: Unset + "ld1 { v12.h }[0], [x23], #0x2\n" + "13:" // Oddments: Load input (0, 1): Bit 1: End + "fmla v31.8h, v1.8h, v12.8h\n" + "ldr x10, [x16, #0x40]\n" + "fmla v30.8h, v0.8h, v12.8h\n" + "add x10, x10, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v11.s }[0], [x10], #0x4\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v11.h }[2], [x10], #0x2\n" + "b 15f\n" + "14:" // Oddments: Load input (0, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x10], #0x2\n" + "15:" // Oddments: Load input (0, 4): Bit 1: End + "fmla v29.8h, v2.8h, v11.8h\n" + "ldr x9, [x16, #0x48]\n" + "fmla v28.8h, v1.8h, v11.8h\n" + "add x9, x9, x14\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v10.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v10.h }[2], [x9], #0x2\n" + "b 17f\n" + "16:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v10.h }[0], [x9], #0x2\n" + "17:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v26.8h, v8.8h, v10.8h\n" + "ldr x28, [x16, #0x50]\n" + "fmla v25.8h, v7.8h, v10.8h\n" + "add x28, x28, x14\n" + "fmla v24.8h, v6.8h, v10.8h\n" + "fmla v22.8h, v5.8h, v10.8h\n" + "fmla v21.8h, v4.8h, v10.8h\n" + "fmla v20.8h, v3.8h, v10.8h\n" + "fmla v18.8h, v2.8h, v10.8h\n" + "fmla v17.8h, v1.8h, v10.8h\n" + "fmla v16.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v9.s }[0], [x28], #0x4\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v9.h }[2], [x28], #0x2\n" + "b 19f\n" + "18:" // Oddments: Load input (1, 0): Bit 1: Unset + "ld1 { v9.h }[0], [x28], #0x2\n" + "19:" // Oddments: Load input (1, 0): Bit 1: End + "fmla v31.8h, v3.8h, v9.8h\n" + "ldr x27, [x16, #0x58]\n" + "fmla v27.8h, v0.8h, v9.8h\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v12.s }[0], [x27], #0x4\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v12.h }[2], [x27], #0x2\n" + "b 21f\n" + "20:" // Oddments: Load input (1, 5): Bit 1: Unset + "ld1 { v12.h }[0], [x27], #0x2\n" + "21:" // Oddments: Load input (1, 5): Bit 1: End + "fmla v28.8h, v5.8h, v12.8h\n" + "ldr x26, [x16, #0x60]\n" + "fmla v24.8h, v2.8h, v12.8h\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v11.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v11.h }[2], [x26], #0x2\n" + "b 23f\n" + "22:" // Oddments: Load input (4, 0): Bit 1: Unset + "ld1 { v11.h }[0], [x26], #0x2\n" + "23:" // Oddments: Load input (4, 0): Bit 1: End + "fmla v23.8h, v6.8h, v11.8h\n" + "ldr x25, [x16, #0x68]\n" + "fmla v19.8h, v3.8h, v11.8h\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v10.h }[2], [x25], #0x2\n" + "b 25f\n" + "24:" // Oddments: Load input (1, 2): Bit 1: Unset + "ld1 { v10.h }[0], [x25], #0x2\n" + "25:" // Oddments: Load input (1, 2): Bit 1: End + "fmla v31.8h, v5.8h, v10.8h\n" + "ldr x24, [x16, #0x70]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "add x24, x24, x14\n" + "fmla v29.8h, v3.8h, v10.8h\n" + "fmla v27.8h, v2.8h, v10.8h\n" + "fmla v26.8h, v1.8h, v10.8h\n" + "fmla v25.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v11.h }[2], [x24], #0x2\n" + "b 27f\n" + "26:" // Oddments: Load input (4, 5): Bit 1: Unset + "ld1 { v11.h }[0], [x24], #0x2\n" + "27:" // Oddments: Load input (4, 5): Bit 1: End + "fmla v20.8h, v8.8h, v11.8h\n" + "ldr x23, [x16, #0x78]\n" + "fmla v16.8h, 
v5.8h, v11.8h\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v12.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v12.h }[2], [x23], #0x2\n" + "b 29f\n" + "28:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v12.h }[0], [x23], #0x2\n" + "29:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr x10, [x16, #0x80]\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "add x10, x10, x14\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v26.8h, v2.8h, v12.8h\n" + "fmla v25.8h, v1.8h, v12.8h\n" + "fmla v24.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v11.s }[0], [x10], #0x4\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v11.h }[2], [x10], #0x2\n" + "b 31f\n" + "30:" // Oddments: Load input (5, 1): Bit 1: Unset + "ld1 { v11.h }[0], [x10], #0x2\n" + "31:" // Oddments: Load input (5, 1): Bit 1: End + "fmla v19.8h, v7.8h, v11.8h\n" + "ldr x9, [x16, #0x88]\n" + "fmla v18.8h, v6.8h, v11.8h\n" + "add x9, x9, x14\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v10.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v10.h }[2], [x9], #0x2\n" + "b 33f\n" + "32:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v10.h }[0], [x9], #0x2\n" + "33:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v31.8h, v7.8h, v10.8h\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.8h, v6.8h, v10.8h\n" + "add x28, x28, x14\n" + "fmla v27.8h, v4.8h, v10.8h\n" + "fmla v26.8h, v3.8h, v10.8h\n" + "fmla v23.8h, v1.8h, v10.8h\n" + "fmla v22.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v11.s }[0], [x28], #0x4\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v11.h }[2], [x28], #0x2\n" + "b 35f\n" + "34:" // Oddments: Load input (5, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x28], #0x2\n" + "35:" // Oddments: Load input (5, 4): Bit 1: End + "fmla v17.8h, v8.8h, v11.8h\n" + "ldr x27, [x16, #0x98]\n" + "fmla v16.8h, v7.8h, v11.8h\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v12.s }[0], [x27], #0x4\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v12.h }[2], [x27], #0x2\n" + "b 37f\n" + "36:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v12.h }[0], [x27], #0x2\n" + "37:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v29.8h, v8.8h, v12.8h\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v28.8h, v7.8h, v12.8h\n" + "add x26, x26, x14\n" + "fmla v25.8h, v5.8h, v12.8h\n" + "fmla v24.8h, v4.8h, v12.8h\n" + "fmla v21.8h, v2.8h, v12.8h\n" + "fmla v20.8h, v1.8h, v12.8h\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v10.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v10.h }[2], [x26], #0x2\n" + "b 39f\n" + "38:" // Oddments: Load input (0, 2): Bit 1: Unset + "ld1 { v10.h }[0], [x26], #0x2\n" + "39:" // Oddments: Load input (0, 2): Bit 1: End + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "add x25, x25, x14\n" + "fmla v29.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 40f\n" + "ld1 { v11.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 41f\n" + "ld1 { v11.h }[2], [x25], #0x2\n" + "b 41f\n" + "40:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v11.h }[0], [x25], #0x2\n" + "41:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v27.8h, v7.8h, v11.8h\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.8h, v6.8h, v11.8h\n" + "add x24, x24, x14\n" + "fmla v23.8h, v4.8h, v11.8h\n" + "fmla v22.8h, v3.8h, v11.8h\n" + "fmla v19.8h, v1.8h, v11.8h\n" + "fmla v18.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 42f\n" + "ld1 { v12.s }[0], [x24], #0x4\n" + "tbz 
%x[n_channels], #0, 43f\n" + "ld1 { v12.h }[2], [x24], #0x2\n" + "b 43f\n" + "42:" // Oddments: Load input (0, 3): Bit 1: Unset + "ld1 { v12.h }[0], [x24], #0x2\n" + "43:" // Oddments: Load input (0, 3): Bit 1: End + "fmla v30.8h, v2.8h, v12.8h\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "add x23, x23, x14\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "tbz %x[n_channels], #1, 44f\n" + "ld1 { v10.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 45f\n" + "ld1 { v10.h }[2], [x23], #0x2\n" + "b 45f\n" + "44:" // Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v10.h }[0], [x23], #0x2\n" + "45:" // Oddments: Load input (2, 0): Bit 1: End + "fmla v31.8h, v6.8h, v10.8h\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v27.8h, v3.8h, v10.8h\n" + "add x10, x10, x14\n" + "fmla v23.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 46f\n" + "ld1 { v11.s }[0], [x10], #0x4\n" + "tbz %x[n_channels], #0, 47f\n" + "ld1 { v11.h }[2], [x10], #0x2\n" + "b 47f\n" + "46:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x10], #0x2\n" + "47:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v25.8h, v8.8h, v11.8h\n" + "ldr x9, [x16, #0xc8]\n" + "fmla v24.8h, v7.8h, v11.8h\n" + "add x9, x9, x14\n" + "fmla v21.8h, v5.8h, v11.8h\n" + "fmla v20.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v2.8h, v11.8h\n" + "fmla v16.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 48f\n" + "ld1 { v12.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 49f\n" + "ld1 { v12.h }[2], [x9], #0x2\n" + "b 49f\n" + "48:" // Oddments: Load input (2, 5): Bit 1: Unset + "ld1 { v12.h }[0], [x9], #0x2\n" + "49:" // Oddments: Load input (2, 5): Bit 1: End + "fmla v28.8h, v8.8h, v12.8h\n" + "ldr x28, [x16, #0xd0]\n" + "fmla v24.8h, v5.8h, v12.8h\n" + "add x28, x28, x14\n" + "fmla v20.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 50f\n" + "ld1 { v10.s }[0], [x28], #0x4\n" + "tbz %x[n_channels], #0, 51f\n" + "ld1 { v10.h }[2], [x28], #0x2\n" + "b 51f\n" + "50:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v10.h }[0], [x28], #0x2\n" + "51:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v27.8h, v6.8h, v10.8h\n" + "ldr x27, [x16, #0xd8]\n" + "fmla v23.8h, v3.8h, v10.8h\n" + "add x27, x27, x14\n" + "fmla v19.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 52f\n" + "ld1 { v11.s }[0], [x27], #0x4\n" + "tbz %x[n_channels], #0, 53f\n" + "ld1 { v11.h }[2], [x27], #0x2\n" + "b 53f\n" + "52:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v11.h }[0], [x27], #0x2\n" + "53:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v23.8h, v8.8h, v11.8h\n" + "ldr x26, [x16, #0xe0]\n" + "fmla v22.8h, v7.8h, v11.8h\n" + "add x26, x26, x14\n" + "fmla v21.8h, v6.8h, v11.8h\n" + "fmla v19.8h, v5.8h, v11.8h\n" + "fmla v18.8h, v4.8h, v11.8h\n" + "fmla v17.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 54f\n" + "ld1 { v12.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 55f\n" + "ld1 { v12.h }[2], [x26], #0x2\n" + "b 55f\n" + "54:" // Oddments: Load input (3, 5): Bit 1: Unset + "ld1 { v12.h }[0], [x26], #0x2\n" + "55:" // Oddments: Load input (3, 5): Bit 1: End + "fmla v24.8h, v8.8h, v12.8h\n" + "ldr x25, [x16, #0xe8]\n" + "fmla v20.8h, v5.8h, v12.8h\n" + "add x25, x25, x14\n" + "fmla v16.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 56f\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 57f\n" + "ld1 { v10.h }[2], [x25], #0x2\n" + "b 57f\n" + "56:" // Oddments: Load input (5, 2): Bit 1: Unset + "ld1 { v10.h }[0], [x25], #0x2\n" + "57:" // Oddments: Load input (5, 2): Bit 1: End + "fmla v19.8h, v8.8h, v10.8h\n" + "ldr x24, 
[x16, #0xf0]\n" + "fmla v18.8h, v7.8h, v10.8h\n" + "add x24, x24, x14\n" + "fmla v17.8h, v6.8h, v10.8h\n" + "tbz %x[n_channels], #1, 58f\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 59f\n" + "ld1 { v11.h }[2], [x24], #0x2\n" + "b 59f\n" + "58:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v11.h }[0], [x24], #0x2\n" + "59:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v22.8h, v8.8h, v11.8h\n" + "ldr x23, [x16, #0xf8]\n" + "fmla v21.8h, v7.8h, v11.8h\n" + "add x23, x23, x14\n" + "fmla v20.8h, v6.8h, v11.8h\n" + "fmla v18.8h, v5.8h, v11.8h\n" + "fmla v17.8h, v4.8h, v11.8h\n" + "fmla v16.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 60f\n" + "ld1 { v12.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 61f\n" + "ld1 { v12.h }[2], [x23], #0x2\n" + "b 61f\n" + "60:" // Oddments: Load input (5, 3): Bit 1: Unset + "ld1 { v12.h }[0], [x23], #0x2\n" + "61:" // Oddments: Load input (5, 3): Bit 1: End + "fmla v18.8h, v8.8h, v12.8h\n" + "ldr x10, [x16, #0x100]\n" + "fmla v17.8h, v7.8h, v12.8h\n" + "add x10, x10, x14\n" + "fmla v16.8h, v6.8h, v12.8h\n" + "tbz %x[n_channels], #1, 62f\n" + "ld1 { v10.s }[0], [x10], #0x4\n" + "tbz %x[n_channels], #0, 63f\n" + "ld1 { v10.h }[2], [x10], #0x2\n" + "b 63f\n" + "62:" // Oddments: Load input (1, 1): Bit 1: Unset + "ld1 { v10.h }[0], [x10], #0x2\n" + "63:" // Oddments: Load input (1, 1): Bit 1: End + "fmla v31.8h, v4.8h, v10.8h\n" + "ldr x9, [x16, #0x108]\n" + "fmla v30.8h, v3.8h, v10.8h\n" + "add x9, x9, x14\n" + "fmla v27.8h, v1.8h, v10.8h\n" + "fmla v26.8h, v0.8h, v10.8h\n" + "tbz %x[n_channels], #1, 64f\n" + "ld1 { v11.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 65f\n" + "ld1 { v11.h }[2], [x9], #0x2\n" + "b 65f\n" + "64:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x9], #0x2\n" + "65:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v29.8h, v5.8h, v11.8h\n" + "ldr x28, [x16, #0x110]\n" + "fmla v28.8h, v4.8h, v11.8h\n" + "add x28, x28, x14\n" + "fmla v25.8h, v2.8h, v11.8h\n" + "fmla v24.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 66f\n" + "ld1 { v12.s }[0], [x28], #0x4\n" + "tbz %x[n_channels], #0, 67f\n" + "ld1 { v12.h }[2], [x28], #0x2\n" + "b 67f\n" + "66:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v12.h }[0], [x28], #0x2\n" + "67:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v23.8h, v7.8h, v12.8h\n" + "ldr x27, [x16, #0x118]\n" + "fmla v22.8h, v6.8h, v12.8h\n" + "add x27, x27, x14\n" + "fmla v19.8h, v4.8h, v12.8h\n" + "fmla v18.8h, v3.8h, v12.8h\n" + "tbz %x[n_channels], #1, 68f\n" + "ld1 { v10.s }[0], [x27], #0x4\n" + "tbz %x[n_channels], #0, 69f\n" + "ld1 { v10.h }[2], [x27], #0x2\n" + "b 69f\n" + "68:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v10.h }[0], [x27], #0x2\n" + "69:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v21.8h, v8.8h, v10.8h\n" + "fmla v20.8h, v7.8h, v10.8h\n" + "fmla v17.8h, v5.8h, v10.8h\n" + "fmla v16.8h, v4.8h, v10.8h\n" + "fmax v31.8h, v31.8h, v15.8h\n" + "fmax v30.8h, v30.8h, v15.8h\n" + "fmax v29.8h, v29.8h, v15.8h\n" + "fmin v31.8h, v31.8h, v14.8h\n" + "fmin v30.8h, v30.8h, v14.8h\n" + "fmin v29.8h, v29.8h, v14.8h\n" + "fmax v28.8h, v28.8h, v15.8h\n" + "fmax v27.8h, v27.8h, v15.8h\n" + "fmax v26.8h, v26.8h, v15.8h\n" + "fmin v28.8h, v28.8h, v14.8h\n" + "fmin v27.8h, v27.8h, v14.8h\n" + "fmin v26.8h, v26.8h, v14.8h\n" + "fmax v25.8h, v25.8h, v15.8h\n" + "fmax v24.8h, v24.8h, v15.8h\n" + "fmax v23.8h, v23.8h, v15.8h\n" + "fmin v25.8h, v25.8h, v14.8h\n" + "fmin v24.8h, v24.8h, v14.8h\n" + "fmin v23.8h, v23.8h, v14.8h\n" + 
"fmax v22.8h, v22.8h, v15.8h\n" + "fmax v21.8h, v21.8h, v15.8h\n" + "fmax v20.8h, v20.8h, v15.8h\n" + "fmin v22.8h, v22.8h, v14.8h\n" + "fmin v21.8h, v21.8h, v14.8h\n" + "fmin v20.8h, v20.8h, v14.8h\n" + "fmax v19.8h, v19.8h, v15.8h\n" + "fmax v18.8h, v18.8h, v15.8h\n" + "fmax v17.8h, v17.8h, v15.8h\n" + "fmin v19.8h, v19.8h, v14.8h\n" + "fmin v18.8h, v18.8h, v14.8h\n" + "fmin v17.8h, v17.8h, v14.8h\n" + "fmax v16.8h, v16.8h, v15.8h\n" + "fmin v16.8h, v16.8h, v14.8h\n" + "tbz %x[n_channels], #1, 70f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.s }[0], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.s }[0], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.s }[0], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.s }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.s }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.s }[0], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.s }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.s }[0], [x19]\n" + "ldr x21, [x17, #0x48]\n" + "add x21, x21, x12\n" + "st1 { v23.s }[0], [x22]\n" + "ldr x20, [x17, #0x50]\n" + "add x20, x20, x12\n" + "st1 { v22.s }[0], [x21]\n" + "ldr x19, [x17, #0x58]\n" + "add x19, x19, x12\n" + "st1 { v21.s }[0], [x20]\n" + "ldr x22, [x17, #0x60]\n" + "add x22, x22, x12\n" + "st1 { v20.s }[0], [x19]\n" + "ldr x21, [x17, #0x68]\n" + "add x21, x21, x12\n" + "st1 { v19.s }[0], [x22]\n" + "ldr x20, [x17, #0x70]\n" + "add x20, x20, x12\n" + "st1 { v18.s }[0], [x21]\n" + "ldr x19, [x17, #0x78]\n" + "add x19, x19, x12\n" + "st1 { v17.s }[0], [x20]\n" + "add x12, x12, #0x4\n" + "st1 { v16.s }[0], [x19]\n" + "tbz %x[n_channels], #0, 71f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.h }[2], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.h }[2], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.h }[2], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.h }[2], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.h }[2], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.h }[2], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.h }[2], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.h }[2], [x19]\n" + "ldr x21, [x17, #0x48]\n" + "add x21, x21, x12\n" + "st1 { v23.h }[2], [x22]\n" + "ldr x20, [x17, #0x50]\n" + "add x20, x20, x12\n" + "st1 { v22.h }[2], [x21]\n" + "ldr x19, [x17, #0x58]\n" + "add x19, x19, x12\n" + "st1 { v21.h }[2], [x20]\n" + "ldr x22, [x17, #0x60]\n" + "add x22, x22, x12\n" + "st1 { v20.h }[2], [x19]\n" + "ldr x21, [x17, #0x68]\n" + "add x21, x21, x12\n" + "st1 { v19.h }[2], [x22]\n" + "ldr x20, [x17, #0x70]\n" + "add x20, x20, x12\n" + "st1 { v18.h }[2], [x21]\n" + "ldr x19, [x17, #0x78]\n" + "add x19, x19, x12\n" + "st1 { v17.h }[2], [x20]\n" + "st1 { v16.h }[2], [x19]\n" + "b 71f\n" + "70:" // Oddments: Store: Bit 1: Unset + "ldr x22, [x17, #0x0]\n" + "add x22, x22, x12\n" + "ldr x21, [x17, #0x8]\n" + "ldr x20, [x17, #0x10]\n" + "add x21, x21, x12\n" + "st1 { v31.h }[0], [x22]\n" + "ldr x19, [x17, #0x18]\n" + "add x20, x20, x12\n" + "st1 { v30.h }[0], [x21]\n" + "add x19, x19, x12\n" + "st1 { v29.h }[0], [x20]\n" + "ldr x22, [x17, 
#0x20]\n" + "add x22, x22, x12\n" + "st1 { v28.h }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.h }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.h }[0], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.h }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.h }[0], [x19]\n" + "ldr x21, [x17, #0x48]\n" + "add x21, x21, x12\n" + "st1 { v23.h }[0], [x22]\n" + "ldr x20, [x17, #0x50]\n" + "add x20, x20, x12\n" + "st1 { v22.h }[0], [x21]\n" + "ldr x19, [x17, #0x58]\n" + "add x19, x19, x12\n" + "st1 { v21.h }[0], [x20]\n" + "ldr x22, [x17, #0x60]\n" + "add x22, x22, x12\n" + "st1 { v20.h }[0], [x19]\n" + "ldr x21, [x17, #0x68]\n" + "add x21, x21, x12\n" + "st1 { v19.h }[0], [x22]\n" + "ldr x20, [x17, #0x70]\n" + "add x20, x20, x12\n" + "st1 { v18.h }[0], [x21]\n" + "ldr x19, [x17, #0x78]\n" + "add x19, x19, x12\n" + "st1 { v17.h }[0], [x20]\n" + "st1 { v16.h }[0], [x19]\n" + "71:" // Oddments: Store: Bit 1: End + + "72:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..ca367cc1af --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); +void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + +struct a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; + + a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..32a6fb964c --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const __fp16 *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + __fp16 *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const __fp16 min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "mov x6, #0x0\n" + "mov x27, #0x0\n" + "1:" // Tile loop + "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x26, #0x4\n" + "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x25, #0x2\n" + "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n" + "add x24, %x[params_struct], %[offsetof_args_min]\n" + "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "add x21, %x[params_struct], %[offsetof_args_max]\n" + "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "mov x22, #0x0\n" + "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x6, x23\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col + "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x26\n" // offset *= kernel_stride * output_size + "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x17, x17, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1r { v19.8h }, [x24]\n" + "add x14, x17, x23, LSL #1\n" + "ld1r { v18.8h }, [x21]\n" + "add x13, x14, x23, LSL #1\n" + "lsl x8, x8, #0x1\n" + "add x12, x13, x23, LSL #1\n" + "add x11, x12, x23, LSL #1\n" + "add x10, x8, x8\n" + "add x9, x10, x8\n" + "add x28, x9, x8\n" + "mul x19, x6, x20\n" // offset = tile_i * ld_output_row + "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x25\n" // offset *= output_tile_size + "add x15, x15, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "add x27, x15, x20, LSL #1\n" + "lsl x16, x16, #0x1\n" + "mov x21,
#0x10\n" // cntb _, ALL, #1 + "sub x20, XZR, x21\n" + "lsr x19, %x[n_channels], #0x3\n" + "cbz x19, 4f\n" + "ldr q17, [x7, #0x0]\n" + "ldr q0, [x7, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x7, #0x20]\n" + "ldr q2, [x7, #0x30]\n" + "ldr q3, [x7, #0x40]\n" + "ldr q4, [x7, #0x50]\n" + "ldr q5, [x7, #0x60]\n" + "ldr q6, [x7, #0x70]\n" + "ldr q7, [x7, #0x80]\n" + "ldr q8, [x7, #0x90]\n" + "add x7, x7, #0xa0\n" + "ldr q9, [x13, x10]\n" + "ld1 { v10.8h }, [x17]\n" + "ldr q11, [x17, x8]\n" + "ldr q12, [x17, x9]\n" + "ldr q13, [x17, x28]\n" + "ld1 { v14.8h }, [x14]\n" + "ldr q15, [x14, x8]\n" + "ldr q16, [x17, x10]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n" + "add x22, x22, #0x10\n" + "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n" + "add x17, x17, #0x10\n" + "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "ldr q17, [x7, #0x0]\n" + "add x21, x21, #0x10\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ld1 { v10.8h }, [x17]\n" + "cmp x21, x19, LSL #4\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "ldr q12, [x14, x28]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x14, x9]\n" + "fmla v30.8h, v2.8h, v13.8h\n" + "ldr q13, [x14, x10]\n" + "add x14, x14, #0x10\n" + "fmla v31.8h, v3.8h, v14.8h\n" + "ld1 { v14.8h }, [x12]\n" + "fmla v30.8h, v0.8h, v16.8h\n" + "fmla v31.8h, v4.8h, v15.8h\n" + "ld1 { v15.8h }, [x13]\n" + "fmla v29.8h, v3.8h, v14.8h\n" + "ldr q14, [x12, x28]\n" + "fmla v30.8h, v4.8h, v11.8h\n" + "ldr q11, [x12, x8]\n" + "fmla v31.8h, v2.8h, v16.8h\n" + "ldr q16, [x13, x8]\n" + "fmla v29.8h, v0.8h, v15.8h\n" + "ldr q0, [x7, #0x10]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr q12, [x13, x9]\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "ldr q11, [x13, x28]\n" + "add x13, x13, #0x10\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "ldr q13, [x12, x9]\n" + "ldr q9, [x13, x10]\n" + "fmla v31.8h, v6.8h, v15.8h\n" + "ld1 { v15.8h }, [x11]\n" + "fmla v29.8h, v1.8h, v16.8h\n" + "fmla v28.8h, v4.8h, v13.8h\n" + "ldr q13, [x11, x8]\n" + "fmla v30.8h, v7.8h, v12.8h\n" + "ldr q4, [x7, #0x50]\n" + "fmla v31.8h, v7.8h, v16.8h\n" + "ldr q16, [x12, x10]\n" + "add x12, x12, #0x10\n" + "fmla v29.8h, v6.8h, v15.8h\n" + "ldr q15, [x11, x10]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q12, [x17, x9]\n" + "fmla v30.8h, v8.8h, v11.8h\n" + "ldr q1, [x7, #0x20]\n" + "fmax v31.8h, v31.8h, v19.8h\n" + "fmla v29.8h, v7.8h, v13.8h\n" + "ldr q13, [x17, x28]\n" + "fmla v28.8h, v5.8h, v14.8h\n" + "ldr q14, [x11, x9]\n" + "fmax v30.8h, v30.8h, v19.8h\n" + "fmin v31.8h, v31.8h, v18.8h\n" + "st1 { v31.8h }, [x15]\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v5.8h, v16.8h\n" + "ldr q11, [x11, x28]\n" + "add x11, x11, #0x10\n" + "fmin v30.8h, v30.8h, v18.8h\n" + "ldr q2, [x7, #0x30]\n" + "ldr q5, [x7, #0x60]\n" + "fmla v28.8h, v3.8h, v16.8h\n" + "ldr q16, [x17, x10]\n" + "fmla v29.8h, v8.8h, v15.8h\n" + "str q30, [x15, x16]\n" + "add x15, x15, #0x10\n" + "fmla v28.8h, v7.8h, v14.8h\n" + "ld1 { v14.8h }, [x14]\n" + "fmax v29.8h, v29.8h, v19.8h\n" + "ldr q3, [x7, #0x40]\n" + "ldr q7, [x7, #0x80]\n" + "fmin v29.8h, v29.8h, v18.8h\n" + "st1 { v29.8h }, [x27]\n" + "fmla v28.8h, v6.8h, v15.8h\n" + "ldr q15, [x14, x8]\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "ldr q11, [x17, x8]\n" + "ldr q6, [x7, #0x70]\n" + "fmax v28.8h, v28.8h, v19.8h\n" + "ldr q8, [x7, #0x90]\n" + "add x7, x7, #0xa0\n" + "fmin v28.8h, v28.8h, v18.8h\n" + "str q28, [x27, x16]\n" + "add x27, x27, #0x10\n" + "blt 
2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "add x17, x17, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n" + "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n" + "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "ldr q12, [x14, x28]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x14, x9]\n" + "fmla v30.8h, v2.8h, v13.8h\n" + "ldr q13, [x14, x10]\n" + "add x14, x14, #0x10\n" + "fmla v31.8h, v3.8h, v14.8h\n" + "ld1 { v14.8h }, [x12]\n" + "fmla v30.8h, v0.8h, v16.8h\n" + "fmla v31.8h, v4.8h, v15.8h\n" + "ld1 { v15.8h }, [x13]\n" + "fmla v30.8h, v4.8h, v11.8h\n" + "ldr q11, [x12, x8]\n" + "fmla v29.8h, v3.8h, v14.8h\n" + "ldr q14, [x12, x28]\n" + "fmla v31.8h, v2.8h, v16.8h\n" + "ldr q16, [x13, x8]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr q12, [x13, x9]\n" + "fmla v29.8h, v0.8h, v15.8h\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "ldr q13, [x12, x9]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "ldr q11, [x13, x28]\n" + "add x13, x13, #0x10\n" + "fmla v31.8h, v6.8h, v15.8h\n" + "ld1 { v15.8h }, [x11]\n" + "fmla v30.8h, v7.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v16.8h\n" + "fmla v28.8h, v4.8h, v13.8h\n" + "ldr q13, [x11, x8]\n" + "fmla v31.8h, v7.8h, v16.8h\n" + "ldr q16, [x12, x10]\n" + "add x12, x12, #0x10\n" + "fmla v29.8h, v6.8h, v15.8h\n" + "ldr q15, [x11, x10]\n" + "fmla v30.8h, v8.8h, v11.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmax v31.8h, v31.8h, v19.8h\n" + "fmla v29.8h, v7.8h, v13.8h\n" + "fmax v30.8h, v30.8h, v19.8h\n" + "fmla v28.8h, v5.8h, v14.8h\n" + "ldr q14, [x11, x9]\n" + "fmin v31.8h, v31.8h, v18.8h\n" + "st1 { v31.8h }, [x15]\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v5.8h, v16.8h\n" + "ldr q11, [x11, x28]\n" + "add x11, x11, #0x10\n" + "fmin v30.8h, v30.8h, v18.8h\n" + "str q30, [x15, x16]\n" + "fmla v28.8h, v3.8h, v16.8h\n" + "add x15, x15, #0x10\n" + "fmla v29.8h, v8.8h, v15.8h\n" + "fmla v28.8h, v7.8h, v14.8h\n" + "fmax v29.8h, v29.8h, v19.8h\n" + "fmla v28.8h, v6.8h, v15.8h\n" + "fmin v29.8h, v29.8h, v18.8h\n" + "st1 { v29.8h }, [x27]\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "fmax v28.8h, v28.8h, v19.8h\n" + "fmin v28.8h, v28.8h, v18.8h\n" + "str q28, [x27, x16]\n" + "add x27, x27, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x1\n" + "beq 43f\n" + "ldr q17, [x7, #0x0]\n" + "ldr q0, [x7, #0x10]\n" + "add x26, x13, x10\n" + "ldr q1, [x7, #0x20]\n" + "add x25, x17, XZR\n" + "ldr q2, [x7, #0x30]\n" + "add x24, x17, x8\n" + "ldr q3, [x7, #0x40]\n" + "add x23, x17, x9\n" + "ldr q4, [x7, #0x50]\n" + "add x22, x17, x28\n" + "ldr q5, [x7, #0x60]\n" + "add x21, x14, XZR\n" + "ldr q6, [x7, #0x70]\n" + "add x20, x14, x8\n" + "ldr q7, [x7, #0x80]\n" + "add x19, x17, x10\n" + "ldr q8, [x7, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr s9, [x26], #0x4\n" + "ldr s10, [x25], #0x4\n" + "ldr s11, [x24], #0x4\n" + "ldr s12, [x23], #0x4\n" + "ldr s13, [x22], #0x4\n" + "ldr s14, [x21], #0x4\n" + "ldr s15, [x20], #0x4\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.h }[2], [x26]\n" + "ld1 { v10.h }[2], [x25]\n" + "ld1 { v11.h }[2], [x24]\n" + "ld1 { v12.h }[2], [x23]\n" + "ld1 { v13.h }[2], [x22]\n" + "ld1 { v14.h }[2], [x21]\n" + "ld1 { v15.h }[2], [x20]\n" + "ld1 { v16.h }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset + "ldr h9, [x26, #0x0]\n" + "ldr h10, [x25, #0x0]\n" + "ldr 
h11, [x24, #0x0]\n" + "ldr h12, [x23, #0x0]\n" + "ldr h13, [x22, #0x0]\n" + "ldr h14, [x21, #0x0]\n" + "ldr h15, [x20, #0x0]\n" + "ldr h16, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End + "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "add x19, x14, x9\n" + "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n" + "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n" + "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "fmla v30.8h, v2.8h, v13.8h\n" + "fmla v31.8h, v3.8h, v14.8h\n" + "fmla v30.8h, v0.8h, v16.8h\n" + "fmla v31.8h, v4.8h, v15.8h\n" + "fmla v31.8h, v2.8h, v16.8h\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.8h, v4.8h, v11.8h\n" + "add x19, x14, x28\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End + "fmla v30.8h, v5.8h, v12.8h\n" + "add x19, x14, x10\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End + "fmla v31.8h, v5.8h, v13.8h\n" + "add x19, x12, XZR\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr s14, [x19], #0x4\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v14.h }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr h14, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v29.8h, v3.8h, v14.8h\n" + "add x19, x13, XZR\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr s15, [x19], #0x4\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v15.h }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset + "ldr h15, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.8h, v6.8h, v15.8h\n" + "add x19, x12, x8\n" + "fmla v29.8h, v0.8h, v15.8h\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v29.8h, v4.8h, v11.8h\n" + "add x19, x13, x8\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v16.h }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr h16, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v31.8h, v7.8h, v16.8h\n" + "add x19, x12, x9\n" + "fmla v29.8h, v1.8h, v16.8h\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr h13, [x19, 
#0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v28.8h, v4.8h, v13.8h\n" + "add x19, x13, x9\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v30.8h, v7.8h, v12.8h\n" + "add x19, x12, x28\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr s14, [x19], #0x4\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v14.h }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr h14, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v28.8h, v5.8h, v14.8h\n" + "add x19, x11, XZR\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr s15, [x19], #0x4\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v15.h }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset + "ldr h15, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End + "fmla v29.8h, v6.8h, v15.8h\n" + "add x19, x13, x28\n" + "tbz %x[n_channels], #1, 29f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v30.8h, v8.8h, v11.8h\n" + "add x19, x11, x8\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End + "fmla v29.8h, v7.8h, v13.8h\n" + "add x19, x12, x10\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v16.h }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr h16, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v29.8h, v5.8h, v16.8h\n" + "add x19, x11, x9\n" + "fmla v28.8h, v3.8h, v16.8h\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr s14, [x19], #0x4\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v14.h }[2], [x19]\n" + "b 36f\n" + "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr h14, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v28.8h, v7.8h, v14.8h\n" + "add x19, x11, x10\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr s15, [x19], #0x4\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v15.h }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset + "ldr h15, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End + "fmla v29.8h, v8.8h, v15.8h\n" + "add x19, x11, x28\n" + "fmla v28.8h, v6.8h, v15.8h\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v28.8h, v8.8h, v11.8h\n" + "fmax v31.8h, v31.8h, v19.8h\n" + "fmax v30.8h, v30.8h, v19.8h\n" + "fmax v29.8h, v29.8h, v19.8h\n" + "fmin v31.8h, v31.8h, v18.8h\n" + "fmin v30.8h, v30.8h, 
v18.8h\n" + "fmin v29.8h, v29.8h, v18.8h\n" + "fmax v28.8h, v28.8h, v19.8h\n" + "fmin v28.8h, v28.8h, v18.8h\n" + "tbz %x[n_channels], #1, 41f\n" + "mov x19, x15\n" + "st1 { v31.s }[0], [x19], x16\n" + "add x15, x15, #0x4\n" + "st1 { v30.s }[0], [x19]\n" + "mov x19, x27\n" + "st1 { v29.s }[0], [x19], x16\n" + "add x27, x27, #0x4\n" + "st1 { v28.s }[0], [x19]\n" + "tbz %x[n_channels], #0, 42f\n" + "mov x20, x15\n" + "st1 { v31.h }[2], [x20], x16\n" + "mov x19, x27\n" + "st1 { v30.h }[2], [x20]\n" + "st1 { v29.h }[2], [x19], x16\n" + "st1 { v28.h }[2], [x19]\n" + "b 42f\n" + "41:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x20, x15\n" + "st1 { v31.h }[0], [x20], x16\n" + "mov x19, x27\n" + "st1 { v30.h }[0], [x20]\n" + "st1 { v29.h }[0], [x19], x16\n" + "st1 { v28.h }[0], [x19]\n" + "42:" // Tile loop: Oddments: Store: Bit 1: End + + "43:" // Tile loop: End + "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x6, #0x1\n" + "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x27, x27, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x27, x19\n" + "csel x27, x27, XZR, LT\n" + "csel x6, x6, x21, LT\n" + "cmp x6, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..f071e21979 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + __fp16 *const *outptrs; + const void *params; + const __fp16 min, max; + const __fp16 *inptrs[25]; + + Args( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *const params, + const __fp16 min, + const __fp16 max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[12]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[1]; + inptrs[3] = input_ptrs[3]; + inptrs[4] = input_ptrs[4]; + inptrs[5] = input_ptrs[5]; + inptrs[6] = input_ptrs[6]; + inptrs[7] = input_ptrs[2]; + inptrs[8] = input_ptrs[8]; + inptrs[9] = input_ptrs[9]; + inptrs[10] = input_ptrs[7]; + inptrs[11] = input_ptrs[15]; + inptrs[12] = input_ptrs[10]; + inptrs[13] = input_ptrs[16]; + inptrs[14] = input_ptrs[11]; + inptrs[15] = input_ptrs[18]; + inptrs[16] = input_ptrs[13]; + inptrs[17] = input_ptrs[19]; + inptrs[18] = input_ptrs[20]; + inptrs[19] = input_ptrs[14]; + inptrs[20] = input_ptrs[21]; + inptrs[21] = input_ptrs[17]; + inptrs[22] = input_ptrs[23]; + inptrs[23] = input_ptrs[22]; + inptrs[24] = input_ptrs[24]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x20, %x[params_struct], %[offsetof_args_min]\n" + "add x19, %x[params_struct], %[offsetof_args_max]\n" + "ld1r { v19.8h }, [x20]\n" + "ld1r { v18.8h }, [x19]\n" + "mov x14, #0x0\n" + "ldp x13, x12, [x21, #0x0]\n" + "mov x11, #0x10\n" // cntb _, ALL, #1 + "ldp x10, x9, [x21, #0x10]\n" + "sub x28, XZR, x11\n" + "lsr x27, %x[n_channels], #0x3\n" + "cbz x27, 3f\n" + "ldr q17, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "cmp x11, x27, LSL #4\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr
q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldr q9, [x26, x14]\n" + "ldr q10, [x25, x14]\n" + "ldr q11, [x24, x14]\n" + "ldr q12, [x23, x14]\n" + "ldr q13, [x22, x14]\n" + "ldr q14, [x21, x14]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ldr q15, [x20, x14]\n" + "ldr q16, [x19, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x26, [x16, #0x40]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n" + "ldr x25, [x16, #0x48]\n" + "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n" + "ldr x24, [x16, #0x50]\n" + "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "ldr x23, [x16, #0x58]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "ldr q12, [x25, x14]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x26, x14]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v30.8h, v2.8h, v13.8h\n" + "ldr q13, [x24, x14]\n" + "fmla v31.8h, v3.8h, v14.8h\n" + "ldr q14, [x23, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v30.8h, v0.8h, v16.8h\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.8h, v4.8h, v15.8h\n" + "ldr q15, [x22, x14]\n" + "fmla v29.8h, v3.8h, v14.8h\n" + "ldr x25, [x16, #0x88]\n" + "fmla v30.8h, v4.8h, v11.8h\n" + "ldr q11, [x21, x14]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.8h, v2.8h, v16.8h\n" + "ldr q16, [x20, x14]\n" + "fmla v29.8h, v0.8h, v15.8h\n" + "ldr q14, [x25, x14]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr q12, [x26, x14]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "ldr q11, [x23, x14]\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "ldr q13, [x19, x14]\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v31.8h, v6.8h, v15.8h\n" + "ldr q15, [x24, x14]\n" + "fmla v29.8h, v1.8h, v16.8h\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v30.8h, v7.8h, v12.8h\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.8h, v4.8h, v13.8h\n" + "ldr q13, [x22, x14]\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v31.8h, v7.8h, v16.8h\n" + "fmla v29.8h, v6.8h, v15.8h\n" + "ldr q16, [x21, x14]\n" + "fmla v30.8h, v8.8h, v11.8h\n" + "ldr q15, [x19, x14]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q17, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "fmla v29.8h, v7.8h, v13.8h\n" + "fmax v31.8h, v31.8h, v19.8h\n" + "ldr q1, [x15, #0x20]\n" + "fmax v30.8h, v30.8h, v19.8h\n" + "ldr q4, [x15, #0x50]\n" + "fmla v28.8h, v5.8h, v14.8h\n" + "ldr q14, [x20, x14]\n" + "fmin v31.8h, v31.8h, v18.8h\n" + "str q31, [x13, x28]\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v5.8h, v16.8h\n" + "ldr q11, [x26, x14]\n" + "add x14, x14, #0x10\n" + "fmin v30.8h, v30.8h, v18.8h\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "fmla v28.8h, v3.8h, v16.8h\n" + "ldp x22, x21, [x16, #0x20]\n" + "fmla v29.8h, v8.8h, v15.8h\n" + "ldr q9, [x26, x11]\n" + "ldr q10, [x25, x11]\n" + "fmla v28.8h, v7.8h, v14.8h\n" + "ldr q12, [x23, x11]\n" + "fmax v29.8h, v29.8h, v19.8h\n" + "ldr q13, [x22, x11]\n" + "ldr q14, [x21, x11]\n" + "fmin v29.8h, v29.8h, v18.8h\n" + "ldp x20, x19, [x16, #0x30]\n" + "str q30, [x12, x28]\n" + "fmla v28.8h, v6.8h, v15.8h\n" + "ldr q2, [x15, #0x30]\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "ldr q11, [x24, x11]\n" + "ldr q15, [x20, x11]\n" + "fmax v28.8h, v28.8h, v19.8h\n" + "ldr q16, [x19, x11]\n" + "add x11, x11, #0x10\n" + "fmin v28.8h, v28.8h, v18.8h\n" + "str q29, [x10, x28]\n" + "cmp x11, 
x27, LSL #4\n" + "ldr q3, [x15, #0x40]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "str q28, [x9, x28]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x26, [x16, #0x40]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n" + "ldr x25, [x16, #0x48]\n" + "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n" + "ldr x24, [x16, #0x50]\n" + "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "ldr x23, [x16, #0x58]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "ldr q12, [x25, x14]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x26, x14]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v30.8h, v2.8h, v13.8h\n" + "ldr q13, [x24, x14]\n" + "fmla v31.8h, v3.8h, v14.8h\n" + "ldr q14, [x23, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v30.8h, v0.8h, v16.8h\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.8h, v4.8h, v15.8h\n" + "ldr q15, [x22, x14]\n" + "fmla v29.8h, v3.8h, v14.8h\n" + "ldr x25, [x16, #0x88]\n" + "fmla v30.8h, v4.8h, v11.8h\n" + "ldr q11, [x21, x14]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.8h, v2.8h, v16.8h\n" + "ldr q16, [x20, x14]\n" + "fmla v29.8h, v0.8h, v15.8h\n" + "ldr q14, [x25, x14]\n" + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr q12, [x26, x14]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.8h, v5.8h, v13.8h\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v29.8h, v4.8h, v11.8h\n" + "ldr q11, [x23, x14]\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "ldr q13, [x19, x14]\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v31.8h, v6.8h, v15.8h\n" + "ldr q15, [x24, x14]\n" + "fmla v29.8h, v1.8h, v16.8h\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v30.8h, v7.8h, v12.8h\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.8h, v4.8h, v13.8h\n" + "ldr q13, [x22, x14]\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v31.8h, v7.8h, v16.8h\n" + "fmla v29.8h, v6.8h, v15.8h\n" + "ldr q16, [x21, x14]\n" + "fmla v30.8h, v8.8h, v11.8h\n" + "ldr q15, [x19, x14]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "fmla v29.8h, v7.8h, v13.8h\n" + "fmax v31.8h, v31.8h, v19.8h\n" + "fmax v30.8h, v30.8h, v19.8h\n" + "fmla v28.8h, v5.8h, v14.8h\n" + "ldr q14, [x20, x14]\n" + "fmin v31.8h, v31.8h, v18.8h\n" + "str q31, [x13, x28]\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v5.8h, v16.8h\n" + "ldr q11, [x26, x14]\n" + "add x14, x14, #0x10\n" + "fmin v30.8h, v30.8h, v18.8h\n" + "str q30, [x12, x28]\n" + "fmla v28.8h, v3.8h, v16.8h\n" + "fmla v29.8h, v8.8h, v15.8h\n" + "fmla v28.8h, v7.8h, v14.8h\n" + "fmax v29.8h, v29.8h, v19.8h\n" + "fmin v29.8h, v29.8h, v18.8h\n" + "str q29, [x10, x28]\n" + "fmla v28.8h, v6.8h, v15.8h\n" + "fmla v28.8h, v8.8h, v11.8h\n" + "fmax v28.8h, v28.8h, v19.8h\n" + "fmin v28.8h, v28.8h, v18.8h\n" + "str q28, [x9, x28]\n" + "3:" // Oddments + "tst %x[n_channels], #0x1\n" + "beq 42f\n" + "ldr q17, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x28, x14\n" + "ldr q1, [x15, #0x20]\n" + "add x13, x13, x28\n" + "ldr q2, [x15, #0x30]\n" + "add x12, x12, x28\n" + "ldr q3, [x15, #0x40]\n" + "add x10, x10, x28\n" + "ldr q4, [x15, #0x50]\n" + "add x9, x9, x28\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x26, [x16, #0x0]\n" + "ldr x25, [x16, #0x8]\n" + "ldr x24, [x16, #0x10]\n" + "add x26, x26, x14\n" + "ldr x23, [x16, #0x18]\n" + "add x25, x25, x14\n" + "ldr x22, [x16, #0x20]\n" + "add x24, x24, x14\n" + "ldr x21, [x16, #0x28]\n" + "add x23, 
x23, x14\n" + "ldr x20, [x16, #0x30]\n" + "add x22, x22, x14\n" + "ldr x19, [x16, #0x38]\n" + "add x21, x21, x14\n" + "add x20, x20, x14\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.s }[0], [x26], #0x4\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "ld1 { v12.s }[0], [x23], #0x4\n" + "ld1 { v13.s }[0], [x22], #0x4\n" + "ld1 { v14.s }[0], [x21], #0x4\n" + "ld1 { v15.s }[0], [x20], #0x4\n" + "ld1 { v16.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.h }[2], [x26], #0x2\n" + "ld1 { v10.h }[2], [x25], #0x2\n" + "ld1 { v11.h }[2], [x24], #0x2\n" + "ld1 { v12.h }[2], [x23], #0x2\n" + "ld1 { v13.h }[2], [x22], #0x2\n" + "ld1 { v14.h }[2], [x21], #0x2\n" + "ld1 { v15.h }[2], [x20], #0x2\n" + "ld1 { v16.h }[2], [x19], #0x2\n" + "b 5f\n" + "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset + "ld1 { v9.h }[0], [x26], #0x2\n" + "ld1 { v10.h }[0], [x25], #0x2\n" + "ld1 { v11.h }[0], [x24], #0x2\n" + "ld1 { v12.h }[0], [x23], #0x2\n" + "ld1 { v13.h }[0], [x22], #0x2\n" + "ld1 { v14.h }[0], [x21], #0x2\n" + "ld1 { v15.h }[0], [x20], #0x2\n" + "ld1 { v16.h }[0], [x19], #0x2\n" + "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End + "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n" + "ldr x26, [x16, #0x40]\n" + "add x26, x26, x14\n" + "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n" + "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n" + "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n" + "fmla v31.8h, v0.8h, v10.8h\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "fmla v30.8h, v2.8h, v13.8h\n" + "fmla v31.8h, v3.8h, v14.8h\n" + "fmla v30.8h, v0.8h, v16.8h\n" + "fmla v31.8h, v4.8h, v15.8h\n" + "fmla v31.8h, v2.8h, v16.8h\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v11.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v11.h }[2], [x26], #0x2\n" + "b 7f\n" + "6:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v11.h }[0], [x26], #0x2\n" + "7:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v30.8h, v4.8h, v11.8h\n" + "ldr x25, [x16, #0x48]\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v12.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v12.h }[2], [x25], #0x2\n" + "b 9f\n" + "8:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v12.h }[0], [x25], #0x2\n" + "9:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v30.8h, v5.8h, v12.8h\n" + "ldr x24, [x16, #0x50]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v13.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v13.h }[2], [x24], #0x2\n" + "b 11f\n" + "10:" // Oddments: Load input (1, 2): Bit 1: Unset + "ld1 { v13.h }[0], [x24], #0x2\n" + "11:" // Oddments: Load input (1, 2): Bit 1: End + "fmla v31.8h, v5.8h, v13.8h\n" + "ldr x23, [x16, #0x58]\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v14.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v14.h }[2], [x23], #0x2\n" + "b 13f\n" + "12:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v14.h }[0], [x23], #0x2\n" + "13:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v29.8h, v3.8h, v14.8h\n" + "ldr x22, [x16, #0x60]\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v15.s }[0], [x22], #0x4\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v15.h }[2], [x22], #0x2\n" + "b 15f\n" + "14:" // 
Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v15.h }[0], [x22], #0x2\n" + "15:" // Oddments: Load input (2, 0): Bit 1: End + "fmla v31.8h, v6.8h, v15.8h\n" + "ldr x21, [x16, #0x68]\n" + "fmla v29.8h, v0.8h, v15.8h\n" + "add x21, x21, x14\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v11.s }[0], [x21], #0x4\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v11.h }[2], [x21], #0x2\n" + "b 17f\n" + "16:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v11.h }[0], [x21], #0x2\n" + "17:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v29.8h, v4.8h, v11.8h\n" + "ldr x20, [x16, #0x70]\n" + "add x20, x20, x14\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v16.s }[0], [x20], #0x4\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "b 19f\n" + "18:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v16.h }[0], [x20], #0x2\n" + "19:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v31.8h, v7.8h, v16.8h\n" + "ldr x19, [x16, #0x78]\n" + "fmla v29.8h, v1.8h, v16.8h\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v13.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v13.h }[2], [x19], #0x2\n" + "b 21f\n" + "20:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v13.h }[0], [x19], #0x2\n" + "21:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v28.8h, v4.8h, v13.8h\n" + "ldr x26, [x16, #0x80]\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v12.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v12.h }[2], [x26], #0x2\n" + "b 23f\n" + "22:" // Oddments: Load input (2, 3): Bit 1: Unset + "ld1 { v12.h }[0], [x26], #0x2\n" + "23:" // Oddments: Load input (2, 3): Bit 1: End + "fmla v30.8h, v7.8h, v12.8h\n" + "ldr x25, [x16, #0x88]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v14.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v14.h }[2], [x25], #0x2\n" + "b 25f\n" + "24:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v14.h }[0], [x25], #0x2\n" + "25:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v28.8h, v5.8h, v14.8h\n" + "ldr x24, [x16, #0x90]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v15.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v15.h }[2], [x24], #0x2\n" + "b 27f\n" + "26:" // Oddments: Load input (4, 0): Bit 1: Unset + "ld1 { v15.h }[0], [x24], #0x2\n" + "27:" // Oddments: Load input (4, 0): Bit 1: End + "fmla v29.8h, v6.8h, v15.8h\n" + "ldr x23, [x16, #0x98]\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v11.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v11.h }[2], [x23], #0x2\n" + "b 29f\n" + "28:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x23], #0x2\n" + "29:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v30.8h, v8.8h, v11.8h\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v13.s }[0], [x22], #0x4\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v13.h }[2], [x22], #0x2\n" + "b 31f\n" + "30:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v13.h }[0], [x22], #0x2\n" + "31:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v29.8h, v7.8h, v13.8h\n" + "ldr x21, [x16, #0xa8]\n" + "add x21, x21, x14\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v16.s }[0], [x21], #0x4\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v16.h }[2], [x21], #0x2\n" + "b 33f\n" + "32:" // Oddments: Load input (3, 2): 
Bit 1: Unset + "ld1 { v16.h }[0], [x21], #0x2\n" + "33:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v29.8h, v5.8h, v16.8h\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v28.8h, v3.8h, v16.8h\n" + "add x20, x20, x14\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v14.s }[0], [x20], #0x4\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v14.h }[2], [x20], #0x2\n" + "b 35f\n" + "34:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v14.h }[0], [x20], #0x2\n" + "35:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v28.8h, v7.8h, v14.8h\n" + "ldr x19, [x16, #0xb8]\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v15.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v15.h }[2], [x19], #0x2\n" + "b 37f\n" + "36:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v15.h }[0], [x19], #0x2\n" + "37:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v29.8h, v8.8h, v15.8h\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v28.8h, v6.8h, v15.8h\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v11.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v11.h }[2], [x26], #0x2\n" + "b 39f\n" + "38:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v11.h }[0], [x26], #0x2\n" + "39:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v28.8h, v8.8h, v11.8h\n" + "fmax v31.8h, v31.8h, v19.8h\n" + "fmax v30.8h, v30.8h, v19.8h\n" + "fmax v29.8h, v29.8h, v19.8h\n" + "fmin v31.8h, v31.8h, v18.8h\n" + "fmin v30.8h, v30.8h, v18.8h\n" + "fmin v29.8h, v29.8h, v18.8h\n" + "fmax v28.8h, v28.8h, v19.8h\n" + "fmin v28.8h, v28.8h, v18.8h\n" + "tbz %x[n_channels], #1, 40f\n" + "st1 { v31.s }[0], [x13], #0x4\n" + "st1 { v30.s }[0], [x12], #0x4\n" + "st1 { v29.s }[0], [x10], #0x4\n" + "st1 { v28.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 41f\n" + "st1 { v31.h }[2], [x13], #0x2\n" + "st1 { v30.h }[2], [x12], #0x2\n" + "st1 { v29.h }[2], [x10], #0x2\n" + "st1 { v28.h }[2], [x9], #0x2\n" + "b 41f\n" + "40:" // Oddments: Store: Bit 1: Unset + "st1 { v31.h }[0], [x13], #0x2\n" + "st1 { v30.h }[0], [x12], #0x2\n" + "st1 { v29.h }[0], [x10], #0x2\n" + "st1 { v28.h }[0], [x9], #0x2\n" + "41:" // Oddments: Store: Bit 1: End + + "42:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..53d2a3a8e1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); +void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + +struct a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + indirect_kern_type indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; + + a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..ec5f97ab6d --- /dev/null +++ 
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const __fp16 *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + __fp16 *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const __fp16 min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "mov x28, #0x0\n" + "mov x27, #0x0\n" + "1:" // Tile loop + "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x26, #0x2\n" + "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x25, #0x2\n" + "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n" + "add x24, %x[params_struct], %[offsetof_args_min]\n" + "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "add x21, %x[params_struct], %[offsetof_args_max]\n" + "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "mov x22, #0x0\n" + "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x28, 
x23\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x27, x4, x19\n" // offset += tile_j * ld_input_col + "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x26\n" // offset *= kernel_stride * output_size + "ldr x7, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x5, x5, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1r { v18.8h }, [x24]\n" + "add x8, x5, x23, LSL #1\n" + "ld1r { v17.8h }, [x21]\n" + "add x17, x8, x23, LSL #1\n" + "lsl x4, x4, #0x1\n" + "add x16, x17, x23, LSL #1\n" + "add x15, x16, x23, LSL #1\n" + "add x14, x15, x23, LSL #1\n" + "add x13, x4, x4\n" + "add x12, x13, x4\n" + "add x11, x12, x4\n" + "add x10, x11, x4\n" + "mul x19, x28, x20\n" // offset = tile_i * ld_output_row + "madd x19, x27, x6, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x25\n" // offset *= output_tile_size + "add x7, x7, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "add x9, x7, x20, LSL #1\n" + "lsl x6, x6, #0x1\n" + "mov x21, #0x10\n" // cntb _, ALL, #1 + "sub x20, XZR, x21\n" + "lsr x19, %x[n_channels], #0x3\n" + "cbz x19, 4f\n" + "ldr q16, [x3, #0x0]\n" + "ldr q0, [x3, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x3, #0x20]\n" + "ldr q2, [x3, #0x30]\n" + "ldr q3, [x3, #0x40]\n" + "ldr q4, [x3, #0x50]\n" + "add x3, x3, #0x60\n" + "ld1 { v5.8h }, [x5]\n" + "ldr q6, [x5, x4]\n" + "ld1 { v7.8h }, [x8]\n" + "ldr q8, [x8, x4]\n" + "ldr q9, [x5, x13]\n" + "ldr q13, [x8, x13]\n" + "ldr q11, [x5, x12]\n" + "ldr q12, [x5, x11]\n" + "ldr q10, [x8, x10]\n" + "ld1 { v14.8h }, [x17]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n" + "ldr q5, [x8, x12]\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n" + "add x22, x22, #0x10\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n" + "add x21, x21, #0x10\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n" + "ldr q0, [x3, #0x0]\n" + "cmp x21, x19, LSL #4\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x8, x11]\n" + "add x8, x8, #0x10\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "ldr q16, [x3, #0x140]\n" + "fmla v29.8h, v1.8h, v8.8h\n" + "fmla v28.8h, v1.8h, v13.8h\n" + "ldr q1, [x3, #0x10]\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "ldr q9, [x5, x10]\n" + "add x5, x5, #0x10\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v13.8h\n" + "fmla v28.8h, v2.8h, v5.8h\n" + "ldr q2, [x3, #0x20]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x17, x4]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v5.8h\n" + "fmla v28.8h, v3.8h, v6.8h\n" + "ldr q3, [x3, #0x30]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x17, x13]\n" + "fmla v30.8h, v4.8h, v9.8h\n" + "ldr q9, [x17, x12]\n" + "fmla v29.8h, v4.8h, v6.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x3, #0x40]\n" + "fmla v31.8h, v0.8h, v7.8h\n" + "ld1 { v7.8h }, [x8]\n" + "fmla v30.8h, v0.8h, v8.8h\n" + "fmla v29.8h, v0.8h, v14.8h\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q0, [x3, #0x50]\n" + "fmla v31.8h, v1.8h, v8.8h\n" + "ldr q8, [x17, x10]\n" + "fmla v30.8h, v1.8h, v13.8h\n" + "fmla v29.8h, v1.8h, v11.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q1, [x3, #0x60]\n" + "fmla v31.8h, v2.8h, v13.8h\n" + "ldr q13, [x17, x11]\n" + "add x17, x17, #0x10\n" + "fmla v30.8h, v2.8h, v5.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "fmla v28.8h, v2.8h, v9.8h\n" + "ldr q2, [x3, #0x70]\n" + "fmla v31.8h, v3.8h, v5.8h\n" + "ld1 { v5.8h }, [x16]\n" + "fmla v30.8h, v3.8h, v6.8h\n" + "fmla 
v29.8h, v3.8h, v9.8h\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr q3, [x3, #0x80]\n" + "fmla v31.8h, v4.8h, v6.8h\n" + "ldr q6, [x16, x4]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "ldr q10, [x16, x13]\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "fmla v28.8h, v4.8h, v8.8h\n" + "ldr q4, [x3, #0x90]\n" + "fmla v31.8h, v0.8h, v14.8h\n" + "ldr q14, [x16, x10]\n" + "fmla v30.8h, v0.8h, v11.8h\n" + "fmla v29.8h, v0.8h, v5.8h\n" + "fmla v28.8h, v0.8h, v6.8h\n" + "ldr q0, [x3, #0xa0]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x16, x12]\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v6.8h\n" + "fmla v28.8h, v1.8h, v10.8h\n" + "ldr q1, [x3, #0xb0]\n" + "fmla v31.8h, v2.8h, v12.8h\n" + "ldr q12, [x16, x11]\n" + "add x16, x16, #0x10\n" + "fmla v30.8h, v2.8h, v9.8h\n" + "fmla v29.8h, v2.8h, v10.8h\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr q2, [x3, #0xc0]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "ld1 { v9.8h }, [x15]\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr q3, [x3, #0xd0]\n" + "fmla v31.8h, v4.8h, v13.8h\n" + "ldr q13, [x15, x4]\n" + "fmla v30.8h, v4.8h, v8.8h\n" + "ldr q8, [x15, x11]\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v4.8h, v14.8h\n" + "ldr q4, [x3, #0xe0]\n" + "fmla v31.8h, v0.8h, v5.8h\n" + "ldr q5, [x15, x13]\n" + "fmla v30.8h, v0.8h, v6.8h\n" + "fmla v29.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v0.8h, v13.8h\n" + "ldr q0, [x3, #0xf0]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x15, x12]\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v1.8h, v13.8h\n" + "fmla v28.8h, v1.8h, v5.8h\n" + "ldr q1, [x3, #0x100]\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr q10, [x15, x10]\n" + "add x15, x15, #0x10\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v5.8h\n" + "fmla v28.8h, v2.8h, v6.8h\n" + "ldr q2, [x3, #0x110]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ld1 { v11.8h }, [x14]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v6.8h\n" + "fmla v28.8h, v3.8h, v8.8h\n" + "ldr q3, [x3, #0x120]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x14, x4]\n" + "fmla v30.8h, v4.8h, v14.8h\n" + "ld1 { v14.8h }, [x17]\n" + "fmla v29.8h, v4.8h, v8.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x3, #0x130]\n" + "fmla v31.8h, v0.8h, v9.8h\n" + "ldr q9, [x14, x13]\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x14, x12]\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "ldr q0, [x3, #0x150]\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "ldr q13, [x8, x13]\n" + "fmla v30.8h, v1.8h, v5.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "ldr q12, [x14, x11]\n" + "fmla v28.8h, v1.8h, v9.8h\n" + "ldr q1, [x3, #0x160]\n" + "fmla v31.8h, v2.8h, v5.8h\n" + "ld1 { v5.8h }, [x5]\n" + "fmla v30.8h, v2.8h, v6.8h\n" + "fmla v29.8h, v2.8h, v9.8h\n" + "ldr q9, [x14, x10]\n" + "add x14, x14, #0x10\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr q2, [x3, #0x170]\n" + "fmla v31.8h, v3.8h, v6.8h\n" + "ldr q6, [x5, x4]\n" + "fmla v30.8h, v3.8h, v8.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "ldr q11, [x5, x12]\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr q3, [x3, #0x180]\n" + "fmla v31.8h, v4.8h, v8.8h\n" + "ldr q8, [x8, x4]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "ldr q10, [x8, x10]\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "ldr q12, [x5, x11]\n" + "fmla v28.8h, v4.8h, v9.8h\n" + "ldr q9, [x5, x13]\n" + "ldr q4, [x3, #0x190]\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "add x3, x3, #0x1a0\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "st1 { v31.8h }, [x7]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + 
"fmin v29.8h, v29.8h, v17.8h\n" + "str q30, [x7, x6]\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "add x7, x7, #0x10\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "st1 { v29.8h }, [x9]\n" + "str q28, [x9, x6]\n" + "add x9, x9, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n" + "ldr q5, [x8, x12]\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n" + "ldr q0, [x3, #0x0]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x8, x11]\n" + "add x8, x8, #0x10\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "fmla v29.8h, v1.8h, v8.8h\n" + "fmla v28.8h, v1.8h, v13.8h\n" + "ldr q1, [x3, #0x10]\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "ldr q9, [x5, x10]\n" + "add x5, x5, #0x10\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v13.8h\n" + "fmla v28.8h, v2.8h, v5.8h\n" + "ldr q2, [x3, #0x20]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x17, x4]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v5.8h\n" + "fmla v28.8h, v3.8h, v6.8h\n" + "ldr q3, [x3, #0x30]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x17, x13]\n" + "fmla v30.8h, v4.8h, v9.8h\n" + "ldr q9, [x17, x12]\n" + "fmla v29.8h, v4.8h, v6.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x3, #0x40]\n" + "fmla v31.8h, v0.8h, v7.8h\n" + "fmla v30.8h, v0.8h, v8.8h\n" + "fmla v29.8h, v0.8h, v14.8h\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q0, [x3, #0x50]\n" + "fmla v31.8h, v1.8h, v8.8h\n" + "ldr q8, [x17, x10]\n" + "fmla v30.8h, v1.8h, v13.8h\n" + "fmla v29.8h, v1.8h, v11.8h\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q1, [x3, #0x60]\n" + "fmla v31.8h, v2.8h, v13.8h\n" + "ldr q13, [x17, x11]\n" + "add x17, x17, #0x10\n" + "fmla v30.8h, v2.8h, v5.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "fmla v28.8h, v2.8h, v9.8h\n" + "ldr q2, [x3, #0x70]\n" + "fmla v31.8h, v3.8h, v5.8h\n" + "ld1 { v5.8h }, [x16]\n" + "fmla v30.8h, v3.8h, v6.8h\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr q3, [x3, #0x80]\n" + "fmla v31.8h, v4.8h, v6.8h\n" + "ldr q6, [x16, x4]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "ldr q10, [x16, x13]\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "fmla v28.8h, v4.8h, v8.8h\n" + "ldr q4, [x3, #0x90]\n" + "fmla v31.8h, v0.8h, v14.8h\n" + "ldr q14, [x16, x10]\n" + "fmla v30.8h, v0.8h, v11.8h\n" + "fmla v29.8h, v0.8h, v5.8h\n" + "fmla v28.8h, v0.8h, v6.8h\n" + "ldr q0, [x3, #0xa0]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x16, x12]\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v6.8h\n" + "fmla v28.8h, v1.8h, v10.8h\n" + "ldr q1, [x3, #0xb0]\n" + "fmla v31.8h, v2.8h, v12.8h\n" + "ldr q12, [x16, x11]\n" + "add x16, x16, #0x10\n" + "fmla v30.8h, v2.8h, v9.8h\n" + "fmla v29.8h, v2.8h, v10.8h\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr q2, [x3, #0xc0]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "ld1 { v9.8h }, [x15]\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr q3, [x3, #0xd0]\n" + "fmla v31.8h, v4.8h, v13.8h\n" + "ldr q13, [x15, x4]\n" + "fmla v30.8h, v4.8h, v8.8h\n" + "ldr q8, [x15, x11]\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v4.8h, v14.8h\n" + "ldr q4, [x3, #0xe0]\n" + "fmla v31.8h, v0.8h, v5.8h\n" + "ldr q5, [x15, x13]\n" + "fmla v30.8h, v0.8h, v6.8h\n" + "fmla v29.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v0.8h, v13.8h\n" + "ldr q0, [x3, #0xf0]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x15, x12]\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v1.8h, v13.8h\n" + "fmla v28.8h, 
v1.8h, v5.8h\n" + "ldr q1, [x3, #0x100]\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr q10, [x15, x10]\n" + "add x15, x15, #0x10\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v5.8h\n" + "fmla v28.8h, v2.8h, v6.8h\n" + "ldr q2, [x3, #0x110]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ld1 { v11.8h }, [x14]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v6.8h\n" + "fmla v28.8h, v3.8h, v8.8h\n" + "ldr q3, [x3, #0x120]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x14, x4]\n" + "fmla v30.8h, v4.8h, v14.8h\n" + "fmla v29.8h, v4.8h, v8.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x3, #0x130]\n" + "add x3, x3, #0x140\n" + "fmla v31.8h, v0.8h, v9.8h\n" + "ldr q9, [x14, x13]\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x14, x12]\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "fmla v30.8h, v1.8h, v5.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "ldr q12, [x14, x11]\n" + "fmla v28.8h, v1.8h, v9.8h\n" + "fmla v31.8h, v2.8h, v5.8h\n" + "fmla v30.8h, v2.8h, v6.8h\n" + "fmla v29.8h, v2.8h, v9.8h\n" + "ldr q9, [x14, x10]\n" + "add x14, x14, #0x10\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "fmla v31.8h, v3.8h, v6.8h\n" + "fmla v30.8h, v3.8h, v8.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v31.8h, v4.8h, v8.8h\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v4.8h, v9.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "st1 { v31.8h }, [x7]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "str q30, [x7, x6]\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "add x7, x7, #0x10\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "st1 { v29.8h }, [x9]\n" + "str q28, [x9, x6]\n" + "add x9, x9, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x1\n" + "beq 61f\n" + "ldr q16, [x3, #0x0]\n" + "ldr q0, [x3, #0x10]\n" + "add x28, x5, XZR\n" + "ldr q1, [x3, #0x20]\n" + "add x27, x5, x4\n" + "ldr q2, [x3, #0x30]\n" + "add x26, x8, XZR\n" + "ldr q3, [x3, #0x40]\n" + "add x25, x8, x4\n" + "ldr q4, [x3, #0x50]\n" + "add x24, x5, x13\n" + "add x23, x8, x13\n" + "add x22, x5, x12\n" + "add x21, x5, x11\n" + "add x20, x8, x10\n" + "add x19, x17, XZR\n" + "add x3, x3, #0x60\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr s5, [x28], #0x4\n" + "ldr s6, [x27], #0x4\n" + "ldr s7, [x26], #0x4\n" + "ldr s8, [x25], #0x4\n" + "ldr s9, [x24], #0x4\n" + "ldr s13, [x23], #0x4\n" + "ldr s11, [x22], #0x4\n" + "ldr s12, [x21], #0x4\n" + "ldr s10, [x20], #0x4\n" + "ldr s14, [x19], #0x4\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v5.h }[2], [x28]\n" + "ld1 { v6.h }[2], [x27]\n" + "ld1 { v7.h }[2], [x26]\n" + "ld1 { v8.h }[2], [x25]\n" + "ld1 { v9.h }[2], [x24]\n" + "ld1 { v13.h }[2], [x23]\n" + "ld1 { v11.h }[2], [x22]\n" + "ld1 { v12.h }[2], [x21]\n" + "ld1 { v10.h }[2], [x20]\n" + "ld1 { v14.h }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset + "ldr h5, [x28, #0x0]\n" + "ldr h6, [x27, #0x0]\n" + "ldr h7, [x26, #0x0]\n" + "ldr h8, [x25, #0x0]\n" + "ldr h9, [x24, #0x0]\n" + "ldr h13, [x23, #0x0]\n" + "ldr h11, [x22, #0x0]\n" + "ldr h12, [x21, #0x0]\n" + "ldr h10, [x20, #0x0]\n" + "ldr h14, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, 
v5.8h\n" + "add x19, x8, x12\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "fmla v29.8h, v1.8h, v8.8h\n" + "fmla v28.8h, v1.8h, v13.8h\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v13.8h\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr s5, [x19], #0x4\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v5.h }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr h5, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v28.8h, v2.8h, v5.8h\n" + "add x19, x8, x11\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v5.8h\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr s6, [x19], #0x4\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v6.h }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr h6, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End + "fmla v28.8h, v3.8h, v6.8h\n" + "add x19, x5, x10\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End + "fmla v30.8h, v4.8h, v9.8h\n" + "ldr h0, [x3, #0xc]\n" + "add x19, x17, x4\n" + "fmla v29.8h, v4.8h, v6.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "fmla v31.8h, v0.8h, v7.8h\n" + "fmla v30.8h, v0.8h, v8.8h\n" + "fmla v29.8h, v0.8h, v14.8h\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr h1, [x3, #0xe]\n" + "add x19, x17, x13\n" + "fmla v31.8h, v1.8h, v8.8h\n" + "fmla v30.8h, v1.8h, v13.8h\n" + "fmla v29.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr h2, [x3, #0x10]\n" + "add x19, x17, x12\n" + "fmla v31.8h, v2.8h, v13.8h\n" + "fmla v30.8h, v2.8h, v5.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v28.8h, v2.8h, v9.8h\n" + "ldr h3, [x3, #0x12]\n" + "add x19, x17, x11\n" + "fmla v31.8h, v3.8h, v5.8h\n" + "fmla v30.8h, v3.8h, v6.8h\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr h4, [x3, #0x14]\n" + "add 
x19, x17, x10\n" + "fmla v31.8h, v4.8h, v6.8h\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr s8, [x19], #0x4\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v8.h }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset + "ldr h8, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End + "fmla v28.8h, v4.8h, v8.8h\n" + "ldr h0, [x3, #0x16]\n" + "add x19, x16, XZR\n" + "fmla v31.8h, v0.8h, v14.8h\n" + "fmla v30.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr s5, [x19], #0x4\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v5.h }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr h5, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v29.8h, v0.8h, v5.8h\n" + "add x19, x16, x4\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr s6, [x19], #0x4\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v6.h }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr h6, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v28.8h, v0.8h, v6.8h\n" + "ldr h1, [x3, #0x18]\n" + "add x19, x16, x13\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v6.8h\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v28.8h, v1.8h, v10.8h\n" + "ldr h2, [x3, #0x1a]\n" + "add x19, x16, x12\n" + "fmla v31.8h, v2.8h, v12.8h\n" + "fmla v30.8h, v2.8h, v9.8h\n" + "fmla v29.8h, v2.8h, v10.8h\n" + "tbz %x[n_channels], #1, 29f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr h3, [x3, #0x1c]\n" + "add x19, x16, x11\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr h4, [x3, #0x1e]\n" + "add x19, x16, x10\n" + "fmla v31.8h, v4.8h, v13.8h\n" + "fmla v30.8h, v4.8h, v8.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr s14, [x19], #0x4\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v14.h }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset + "ldr h14, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End + "fmla v28.8h, v4.8h, v14.8h\n" + "ldr h0, [x3, #0x20]\n" + "add x19, x15, XZR\n" + "fmla v31.8h, v0.8h, v5.8h\n" + "fmla v30.8h, v0.8h, v6.8h\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 36f\n" + "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End + "fmla v29.8h, v0.8h, 
v9.8h\n" + "add x19, x15, x4\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr s13, [x19], #0x4\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v13.h }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr h13, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End + "fmla v28.8h, v0.8h, v13.8h\n" + "ldr h1, [x3, #0x22]\n" + "add x19, x15, x13\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v1.8h, v13.8h\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr s5, [x19], #0x4\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v5.h }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset + "ldr h5, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End + "fmla v28.8h, v1.8h, v5.8h\n" + "ldr h2, [x3, #0x24]\n" + "add x19, x15, x12\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v5.8h\n" + "tbz %x[n_channels], #1, 41f\n" + "ldr s6, [x19], #0x4\n" + "tbz %x[n_channels], #0, 42f\n" + "ld1 { v6.h }[2], [x19]\n" + "b 42f\n" + "41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr h6, [x19, #0x0]\n" + "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v28.8h, v2.8h, v6.8h\n" + "ldr h3, [x3, #0x26]\n" + "add x19, x15, x11\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v6.8h\n" + "tbz %x[n_channels], #1, 43f\n" + "ldr s8, [x19], #0x4\n" + "tbz %x[n_channels], #0, 44f\n" + "ld1 { v8.h }[2], [x19]\n" + "b 44f\n" + "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr h8, [x19, #0x0]\n" + "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v28.8h, v3.8h, v8.8h\n" + "ldr h4, [x3, #0x28]\n" + "add x19, x15, x10\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "fmla v30.8h, v4.8h, v14.8h\n" + "fmla v29.8h, v4.8h, v8.8h\n" + "tbz %x[n_channels], #1, 45f\n" + "ldr s10, [x19], #0x4\n" + "tbz %x[n_channels], #0, 46f\n" + "ld1 { v10.h }[2], [x19]\n" + "b 46f\n" + "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset + "ldr h10, [x19, #0x0]\n" + "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr h0, [x3, #0x2a]\n" + "add x19, x14, XZR\n" + "fmla v31.8h, v0.8h, v9.8h\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "tbz %x[n_channels], #1, 47f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 48f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 48f\n" + "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End + "fmla v29.8h, v0.8h, v11.8h\n" + "add x19, x14, x4\n" + "tbz %x[n_channels], #1, 49f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 50f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 50f\n" + "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End + "fmla v28.8h, v0.8h, v12.8h\n" + "ldr h1, [x3, #0x2c]\n" + "add x19, x14, x13\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "fmla v30.8h, v1.8h, v5.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "tbz %x[n_channels], #1, 51f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 52f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 52f\n" + "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End + "fmla v28.8h, v1.8h, v9.8h\n" + "ldr h2, [x3, #0x2e]\n" + "add x19, x14, 
x12\n" + "fmla v31.8h, v2.8h, v5.8h\n" + "fmla v30.8h, v2.8h, v6.8h\n" + "fmla v29.8h, v2.8h, v9.8h\n" + "tbz %x[n_channels], #1, 53f\n" + "ldr s11, [x19], #0x4\n" + "tbz %x[n_channels], #0, 54f\n" + "ld1 { v11.h }[2], [x19]\n" + "b 54f\n" + "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset + "ldr h11, [x19, #0x0]\n" + "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr h3, [x3, #0x30]\n" + "add x19, x14, x11\n" + "fmla v31.8h, v3.8h, v6.8h\n" + "fmla v30.8h, v3.8h, v8.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 55f\n" + "ldr s12, [x19], #0x4\n" + "tbz %x[n_channels], #0, 56f\n" + "ld1 { v12.h }[2], [x19]\n" + "b 56f\n" + "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset + "ldr h12, [x19, #0x0]\n" + "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr h4, [x3, #0x32]\n" + "add x19, x14, x10\n" + "fmla v31.8h, v4.8h, v8.8h\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "tbz %x[n_channels], #1, 57f\n" + "ldr s9, [x19], #0x4\n" + "tbz %x[n_channels], #0, 58f\n" + "ld1 { v9.h }[2], [x19]\n" + "b 58f\n" + "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset + "ldr h9, [x19, #0x0]\n" + "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End + "fmla v28.8h, v4.8h, v9.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "tbz %x[n_channels], #1, 59f\n" + "mov x19, x7\n" + "st1 { v31.s }[0], [x19], x6\n" + "add x7, x7, #0x4\n" + "st1 { v30.s }[0], [x19]\n" + "mov x19, x9\n" + "st1 { v29.s }[0], [x19], x6\n" + "add x9, x9, #0x4\n" + "st1 { v28.s }[0], [x19]\n" + "tbz %x[n_channels], #0, 60f\n" + "mov x20, x7\n" + "st1 { v31.h }[2], [x20], x6\n" + "mov x19, x9\n" + "st1 { v30.h }[2], [x20]\n" + "st1 { v29.h }[2], [x19], x6\n" + "st1 { v28.h }[2], [x19]\n" + "b 60f\n" + "59:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x20, x7\n" + "st1 { v31.h }[0], [x20], x6\n" + "mov x19, x9\n" + "st1 { v30.h }[0], [x20]\n" + "st1 { v29.h }[0], [x19], x6\n" + "st1 { v28.h }[0], [x19]\n" + "60:" // Tile loop: Oddments: Store: Bit 1: End + + "61:" // Tile loop: End + "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x28, #0x1\n" + "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x27, x27, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x27, x19\n" + "csel x27, x27, XZR, LT\n" + "csel x28, x28, x21, LT\n" + "cmp x28, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), 
[offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..96e1ae496e --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + __fp16 *const *outptrs; + const void *params; + const __fp16 min, max; + const __fp16 *inptrs[36]; + + Args( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *const params, + const __fp16 min, + const __fp16 max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[0]; + inptrs[1] = input_ptrs[1]; + inptrs[2] = input_ptrs[6]; + inptrs[3] = input_ptrs[7]; + inptrs[4] = input_ptrs[2]; + inptrs[5] = input_ptrs[8]; + inptrs[6] = input_ptrs[3]; + inptrs[7] = input_ptrs[4]; + inptrs[8] = input_ptrs[11]; + inptrs[9] = input_ptrs[12]; + inptrs[10] = input_ptrs[9]; + inptrs[11] = input_ptrs[10]; + inptrs[12] = input_ptrs[5]; + inptrs[13] = input_ptrs[13]; + inptrs[14] = input_ptrs[14]; + inptrs[15] = input_ptrs[15]; + inptrs[16] = input_ptrs[16]; + inptrs[17] = input_ptrs[17]; + inptrs[18] = input_ptrs[18]; + inptrs[19] = input_ptrs[19]; + inptrs[20] = input_ptrs[20]; + inptrs[21] = input_ptrs[21]; + inptrs[22] = input_ptrs[22]; + inptrs[23] = input_ptrs[23]; + inptrs[24] = input_ptrs[24]; + inptrs[25] = input_ptrs[25]; + inptrs[26] = input_ptrs[26]; + inptrs[27] = input_ptrs[27]; + inptrs[28] = input_ptrs[28]; + inptrs[29] = input_ptrs[29]; + inptrs[30] = input_ptrs[30]; + inptrs[31] = input_ptrs[31]; + inptrs[32] = input_ptrs[32]; + inptrs[33] = input_ptrs[33]; + inptrs[34] = input_ptrs[34]; + inptrs[35] = input_ptrs[35]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x20, %x[params_struct], %[offsetof_args_min]\n" + "add x19, %x[params_struct], %[offsetof_args_max]\n" + "ld1r { v18.8h }, [x20]\n" + "ld1r { v17.8h }, [x19]\n" + "mov x14, #0x0\n" + "ldp x13, x12, [x21, #0x0]\n" + "mov x11, #0x10\n" // cntb _, ALL, #1 + "ldp x10, x9, [x21, #0x10]\n" + "sub x28, XZR, x11\n" + "lsr x27, %x[n_channels], #0x3\n" + "cbz x27, 3f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "cmp x11, x27, LSL #4\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "add x15, x15, #0x60\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldr q5, [x26, x14]\n" + "ldr q6, [x25, x14]\n" + "ldr q7, [x24, x14]\n" + "ldr q8, [x23, x14]\n" + "ldr q9, [x22, x14]\n" + "ldr q13, [x21, x14]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ldp x26, x25, [x16, #0x40]\n" + "ldr q11, [x20, x14]\n" + "ldr q12, [x19, x14]\n" + "ldr q10, [x26, x14]\n" + "ldr q14, [x25, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n" + "ldr x24, [x16, #0x50]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n" + "ldr x23, [x16, #0x58]\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n" + "ldr x22, [x16, #0x60]\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n" + "ldr q5, [x24, x14]\n" + "ldr q0, 
[x15, #0x0]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x23, x14]\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "fmla v29.8h, v1.8h, v8.8h\n" + "ldr x21, [x16, #0x68]\n" + "fmla v28.8h, v1.8h, v13.8h\n" + "ldr q1, [x15, #0x10]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "ldr q9, [x22, x14]\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v13.8h\n" + "ldr x19, [x16, #0x78]\n" + "fmla v28.8h, v2.8h, v5.8h\n" + "ldr q2, [x15, #0x20]\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x21, x14]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v5.8h\n" + "ldr x25, [x16, #0x88]\n" + "fmla v28.8h, v3.8h, v6.8h\n" + "ldr q3, [x15, #0x30]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x20, x14]\n" + "fmla v30.8h, v4.8h, v9.8h\n" + "fmla v29.8h, v4.8h, v6.8h\n" + "ldr q9, [x19, x14]\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x15, #0x40]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.8h, v0.8h, v7.8h\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v30.8h, v0.8h, v8.8h\n" + "fmla v29.8h, v0.8h, v14.8h\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q0, [x15, #0x50]\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v31.8h, v1.8h, v8.8h\n" + "ldr q8, [x25, x14]\n" + "fmla v30.8h, v1.8h, v13.8h\n" + "fmla v29.8h, v1.8h, v11.8h\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q1, [x15, #0x60]\n" + "ldr x25, [x16, #0xc8]\n" + "fmla v31.8h, v2.8h, v13.8h\n" + "ldr q13, [x26, x14]\n" + "fmla v30.8h, v2.8h, v5.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v28.8h, v2.8h, v9.8h\n" + "ldr q2, [x15, #0x70]\n" + "ldr q16, [x15, #0x140]\n" + "fmla v31.8h, v3.8h, v5.8h\n" + "ldr q5, [x24, x14]\n" + "fmla v30.8h, v3.8h, v6.8h\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "ldr x24, [x16, #0xd0]\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr q3, [x15, #0x80]\n" + "fmla v31.8h, v4.8h, v6.8h\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0xd8]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "ldr q10, [x22, x14]\n" + "fmla v28.8h, v4.8h, v8.8h\n" + "ldr q4, [x15, #0x90]\n" + "ldr x22, [x16, #0xe0]\n" + "fmla v31.8h, v0.8h, v14.8h\n" + "ldr q14, [x19, x14]\n" + "fmla v30.8h, v0.8h, v11.8h\n" + "fmla v29.8h, v0.8h, v5.8h\n" + "ldr x19, [x16, #0xf8]\n" + "fmla v28.8h, v0.8h, v6.8h\n" + "ldr q0, [x15, #0xa0]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x21, x14]\n" + "ldr x21, [x16, #0xe8]\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v6.8h\n" + "fmla v28.8h, v1.8h, v10.8h\n" + "ldr q1, [x15, #0xb0]\n" + "fmla v31.8h, v2.8h, v12.8h\n" + "ldr q12, [x20, x14]\n" + "ldr x20, [x16, #0xf0]\n" + "fmla v30.8h, v2.8h, v9.8h\n" + "fmla v29.8h, v2.8h, v10.8h\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr q2, [x15, #0xc0]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "ldr q9, [x26, x14]\n" + "ldr x26, [x16, #0x100]\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr q3, [x15, #0xd0]\n" + "fmla v31.8h, v4.8h, v13.8h\n" + "ldr q13, [x25, x14]\n" + "ldr x25, [x16, #0x108]\n" + "fmla v30.8h, v4.8h, v8.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "ldr q8, [x22, x14]\n" + "fmla v28.8h, v4.8h, v14.8h\n" + "ldr q4, [x15, #0xe0]\n" + "fmla v31.8h, v0.8h, v5.8h\n" + "ldr q5, [x24, x14]\n" + "ldr x24, [x16, #0x110]\n" + "fmla v30.8h, v0.8h, v6.8h\n" + "fmla v29.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v0.8h, v13.8h\n" + "ldr q0, [x15, #0xf0]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0x118]\n" + "fmla v30.8h, 
v1.8h, v10.8h\n" + "fmla v29.8h, v1.8h, v13.8h\n" + "fmla v28.8h, v1.8h, v5.8h\n" + "ldr q1, [x15, #0x100]\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr q10, [x21, x14]\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v5.8h\n" + "fmla v28.8h, v2.8h, v6.8h\n" + "ldr q2, [x15, #0x110]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x20, x14]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v6.8h\n" + "fmla v28.8h, v3.8h, v8.8h\n" + "ldr q3, [x15, #0x120]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x19, x14]\n" + "fmla v30.8h, v4.8h, v14.8h\n" + "fmla v29.8h, v4.8h, v8.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x15, #0x130]\n" + "fmla v31.8h, v0.8h, v9.8h\n" + "ldr q9, [x26, x14]\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x25, x14]\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldr q0, [x15, #0x150]\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "fmla v30.8h, v1.8h, v5.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "ldr q12, [x24, x14]\n" + "fmla v28.8h, v1.8h, v9.8h\n" + "ldr q1, [x15, #0x160]\n" + "fmla v31.8h, v2.8h, v5.8h\n" + "ldr q5, [x26, x11]\n" + "fmla v30.8h, v2.8h, v6.8h\n" + "fmla v29.8h, v2.8h, v9.8h\n" + "ldr q9, [x23, x14]\n" + "add x14, x14, #0x10\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldp x22, x21, [x16, #0x20]\n" + "fmla v31.8h, v3.8h, v6.8h\n" + "ldr q6, [x25, x11]\n" + "fmla v30.8h, v3.8h, v8.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "ldr q7, [x24, x11]\n" + "ldr q13, [x21, x11]\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldp x20, x19, [x16, #0x30]\n" + "fmla v31.8h, v4.8h, v8.8h\n" + "ldr q8, [x23, x11]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "ldr q11, [x20, x11]\n" + "ldr q12, [x19, x11]\n" + "fmla v28.8h, v4.8h, v9.8h\n" + "ldr q9, [x22, x11]\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "ldp x26, x25, [x16, #0x40]\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "ldr q2, [x15, #0x170]\n" + "ldr q3, [x15, #0x180]\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "ldr q10, [x26, x11]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "ldr q14, [x25, x11]\n" + "add x11, x11, #0x10\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "str q31, [x13, x28]\n" + "cmp x11, x27, LSL #4\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "str q30, [x12, x28]\n" + "ldr q4, [x15, #0x190]\n" + "add x15, x15, #0x1a0\n" + "str q29, [x10, x28]\n" + "str q28, [x9, x28]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n" + "ldr x24, [x16, #0x50]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n" + "ldr x23, [x16, #0x58]\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n" + "ldr x22, [x16, #0x60]\n" + "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n" + "ldr q5, [x24, x14]\n" + "ldr q0, [x15, #0x0]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x23, x14]\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "fmla v29.8h, v1.8h, v8.8h\n" + "ldr x21, [x16, #0x68]\n" + "fmla v28.8h, v1.8h, v13.8h\n" + "ldr q1, [x15, #0x10]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "ldr q9, [x22, x14]\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v13.8h\n" + "ldr x19, [x16, #0x78]\n" + "fmla v28.8h, v2.8h, v5.8h\n" + "ldr q2, [x15, #0x20]\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x21, x14]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v5.8h\n" + "ldr x25, [x16, #0x88]\n" + "fmla v28.8h, v3.8h, v6.8h\n" + "ldr q3, [x15, 
#0x30]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x20, x14]\n" + "fmla v30.8h, v4.8h, v9.8h\n" + "fmla v29.8h, v4.8h, v6.8h\n" + "ldr q9, [x19, x14]\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x15, #0x40]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.8h, v0.8h, v7.8h\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v30.8h, v0.8h, v8.8h\n" + "fmla v29.8h, v0.8h, v14.8h\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr q0, [x15, #0x50]\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v31.8h, v1.8h, v8.8h\n" + "ldr q8, [x25, x14]\n" + "fmla v30.8h, v1.8h, v13.8h\n" + "fmla v29.8h, v1.8h, v11.8h\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr q1, [x15, #0x60]\n" + "ldr x25, [x16, #0xc8]\n" + "fmla v31.8h, v2.8h, v13.8h\n" + "ldr q13, [x26, x14]\n" + "fmla v30.8h, v2.8h, v5.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v28.8h, v2.8h, v9.8h\n" + "ldr q2, [x15, #0x70]\n" + "fmla v31.8h, v3.8h, v5.8h\n" + "ldr q5, [x24, x14]\n" + "ldr x24, [x16, #0xd0]\n" + "fmla v30.8h, v3.8h, v6.8h\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr q3, [x15, #0x80]\n" + "fmla v31.8h, v4.8h, v6.8h\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0xd8]\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "ldr q10, [x22, x14]\n" + "fmla v28.8h, v4.8h, v8.8h\n" + "ldr q4, [x15, #0x90]\n" + "ldr x22, [x16, #0xe0]\n" + "fmla v31.8h, v0.8h, v14.8h\n" + "ldr q14, [x19, x14]\n" + "fmla v30.8h, v0.8h, v11.8h\n" + "fmla v29.8h, v0.8h, v5.8h\n" + "ldr x19, [x16, #0xf8]\n" + "fmla v28.8h, v0.8h, v6.8h\n" + "ldr q0, [x15, #0xa0]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr q11, [x21, x14]\n" + "ldr x21, [x16, #0xe8]\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v6.8h\n" + "fmla v28.8h, v1.8h, v10.8h\n" + "ldr q1, [x15, #0xb0]\n" + "fmla v31.8h, v2.8h, v12.8h\n" + "ldr q12, [x20, x14]\n" + "ldr x20, [x16, #0xf0]\n" + "fmla v30.8h, v2.8h, v9.8h\n" + "fmla v29.8h, v2.8h, v10.8h\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr q2, [x15, #0xc0]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "ldr q9, [x26, x14]\n" + "ldr x26, [x16, #0x100]\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr q3, [x15, #0xd0]\n" + "fmla v31.8h, v4.8h, v13.8h\n" + "ldr q13, [x25, x14]\n" + "ldr x25, [x16, #0x108]\n" + "fmla v30.8h, v4.8h, v8.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "ldr q8, [x22, x14]\n" + "fmla v28.8h, v4.8h, v14.8h\n" + "ldr q4, [x15, #0xe0]\n" + "fmla v31.8h, v0.8h, v5.8h\n" + "ldr q5, [x24, x14]\n" + "ldr x24, [x16, #0x110]\n" + "fmla v30.8h, v0.8h, v6.8h\n" + "fmla v29.8h, v0.8h, v9.8h\n" + "fmla v28.8h, v0.8h, v13.8h\n" + "ldr q0, [x15, #0xf0]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0x118]\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v1.8h, v13.8h\n" + "fmla v28.8h, v1.8h, v5.8h\n" + "ldr q1, [x15, #0x100]\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr q10, [x21, x14]\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v5.8h\n" + "fmla v28.8h, v2.8h, v6.8h\n" + "ldr q2, [x15, #0x110]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr q11, [x20, x14]\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v6.8h\n" + "fmla v28.8h, v3.8h, v8.8h\n" + "ldr q3, [x15, #0x120]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr q12, [x19, x14]\n" + "fmla v30.8h, v4.8h, v14.8h\n" + "fmla v29.8h, v4.8h, v8.8h\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr q4, [x15, #0x130]\n" + "add x15, x15, #0x140\n" + "fmla v31.8h, v0.8h, 
v9.8h\n" + "ldr q9, [x26, x14]\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr q11, [x25, x14]\n" + "fmla v28.8h, v0.8h, v12.8h\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "fmla v30.8h, v1.8h, v5.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "ldr q12, [x24, x14]\n" + "fmla v28.8h, v1.8h, v9.8h\n" + "fmla v31.8h, v2.8h, v5.8h\n" + "fmla v30.8h, v2.8h, v6.8h\n" + "fmla v29.8h, v2.8h, v9.8h\n" + "ldr q9, [x23, x14]\n" + "add x14, x14, #0x10\n" + "fmla v28.8h, v2.8h, v11.8h\n" + "fmla v31.8h, v3.8h, v6.8h\n" + "fmla v30.8h, v3.8h, v8.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "fmla v28.8h, v3.8h, v12.8h\n" + "fmla v31.8h, v4.8h, v8.8h\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "fmla v28.8h, v4.8h, v9.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "str q31, [x13, x28]\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "str q30, [x12, x28]\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "str q29, [x10, x28]\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "str q28, [x9, x28]\n" + "3:" // Oddments + "tst %x[n_channels], #0x1\n" + "beq 60f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x28, x14\n" + "ldr q1, [x15, #0x20]\n" + "add x13, x13, x28\n" + "ldr q2, [x15, #0x30]\n" + "add x12, x12, x28\n" + "ldr q3, [x15, #0x40]\n" + "add x10, x10, x28\n" + "ldr q4, [x15, #0x50]\n" + "add x9, x9, x28\n" + "ldr x24, [x16, #0x10]\n" + "ldr x23, [x16, #0x18]\n" + "ldr x22, [x16, #0x20]\n" + "add x24, x24, x14\n" + "ldr x21, [x16, #0x28]\n" + "add x23, x23, x14\n" + "ldr x20, [x16, #0x30]\n" + "add x22, x22, x14\n" + "ldr x19, [x16, #0x38]\n" + "add x21, x21, x14\n" + "ldr x26, [x16, #0x40]\n" + "add x20, x20, x14\n" + "ldr x25, [x16, #0x48]\n" + "add x19, x19, x14\n" + "add x26, x26, x14\n" + "add x25, x25, x14\n" + "add x15, x15, #0x60\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v5.s }[0], [x26], #0x4\n" + "ld1 { v6.s }[0], [x25], #0x4\n" + "ld1 { v7.s }[0], [x24], #0x4\n" + "ld1 { v8.s }[0], [x23], #0x4\n" + "ld1 { v9.s }[0], [x22], #0x4\n" + "ld1 { v13.s }[0], [x21], #0x4\n" + "ld1 { v11.s }[0], [x20], #0x4\n" + "ld1 { v12.s }[0], [x19], #0x4\n" + "ld1 { v10.s }[0], [x26], #0x4\n" + "ld1 { v14.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v7.h }[2], [x24], #0x2\n" + "ld1 { v8.h }[2], [x23], #0x2\n" + "ld1 { v5.h }[2], [x26], #0x2\n" + "ld1 { v6.h }[2], [x25], #0x2\n" + "ld1 { v9.h }[2], [x22], #0x2\n" + "ld1 { v13.h }[2], [x21], #0x2\n" + "ld1 { v11.h }[2], [x20], #0x2\n" + "ld1 { v12.h }[2], [x19], #0x2\n" + "ld1 { v10.h }[2], [x26], #0x2\n" + "ld1 { v14.h }[2], [x25], #0x2\n" + "b 5f\n" + "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset + "ld1 { v5.h }[0], [x26], #0x2\n" + "ld1 { v6.h }[0], [x25], #0x2\n" + "ld1 { v7.h }[0], [x24], #0x2\n" + "ld1 { v8.h }[0], [x23], #0x2\n" + "ld1 { v9.h }[0], [x22], #0x2\n" + "ld1 { v13.h }[0], [x21], #0x2\n" + "ld1 { v11.h }[0], [x20], #0x2\n" + "ld1 { v12.h }[0], [x19], #0x2\n" + "ld1 { v10.h }[0], [x26], #0x2\n" + "ld1 { v14.h }[0], [x25], #0x2\n" + "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n" + "ldr x24, [x16, #0x50]\n" + "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n" + "add x24, x24, x14\n" + "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n" + "mov v28.16b, v16.16b\n fmla 
v28.8h, v0.8h, v8.8h\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "fmla v30.8h, v1.8h, v9.8h\n" + "fmla v29.8h, v1.8h, v8.8h\n" + "fmla v28.8h, v1.8h, v13.8h\n" + "fmla v31.8h, v2.8h, v9.8h\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v13.8h\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v5.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v5.h }[2], [x24], #0x2\n" + "b 7f\n" + "6:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v5.h }[0], [x24], #0x2\n" + "7:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v28.8h, v2.8h, v5.8h\n" + "ldr x23, [x16, #0x58]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "add x23, x23, x14\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v5.8h\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v6.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v6.h }[2], [x23], #0x2\n" + "b 9f\n" + "8:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v6.h }[0], [x23], #0x2\n" + "9:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v28.8h, v3.8h, v6.8h\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v9.s }[0], [x22], #0x4\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v9.h }[2], [x22], #0x2\n" + "b 11f\n" + "10:" // Oddments: Load input (0, 5): Bit 1: Unset + "ld1 { v9.h }[0], [x22], #0x2\n" + "11:" // Oddments: Load input (0, 5): Bit 1: End + "fmla v30.8h, v4.8h, v9.8h\n" + "ldr h0, [x15, #0xc]\n" + "fmla v29.8h, v4.8h, v6.8h\n" + "ldr x21, [x16, #0x68]\n" + "add x21, x21, x14\n" + "fmla v28.8h, v4.8h, v10.8h\n" + "fmla v31.8h, v0.8h, v7.8h\n" + "fmla v30.8h, v0.8h, v8.8h\n" + "fmla v29.8h, v0.8h, v14.8h\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v11.s }[0], [x21], #0x4\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v11.h }[2], [x21], #0x2\n" + "b 13f\n" + "12:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v11.h }[0], [x21], #0x2\n" + "13:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v28.8h, v0.8h, v11.8h\n" + "ldr h1, [x15, #0xe]\n" + "fmla v31.8h, v1.8h, v8.8h\n" + "ldr x20, [x16, #0x70]\n" + "add x20, x20, x14\n" + "fmla v30.8h, v1.8h, v13.8h\n" + "fmla v29.8h, v1.8h, v11.8h\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v12.s }[0], [x20], #0x4\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v12.h }[2], [x20], #0x2\n" + "b 15f\n" + "14:" // Oddments: Load input (2, 2): Bit 1: Unset + "ld1 { v12.h }[0], [x20], #0x2\n" + "15:" // Oddments: Load input (2, 2): Bit 1: End + "fmla v28.8h, v1.8h, v12.8h\n" + "ldr h2, [x15, #0x10]\n" + "fmla v31.8h, v2.8h, v13.8h\n" + "ldr x19, [x16, #0x78]\n" + "add x19, x19, x14\n" + "fmla v30.8h, v2.8h, v5.8h\n" + "fmla v29.8h, v2.8h, v12.8h\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v9.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v9.h }[2], [x19], #0x2\n" + "b 17f\n" + "16:" // Oddments: Load input (2, 3): Bit 1: Unset + "ld1 { v9.h }[0], [x19], #0x2\n" + "17:" // Oddments: Load input (2, 3): Bit 1: End + "fmla v28.8h, v2.8h, v9.8h\n" + "ldr h3, [x15, #0x12]\n" + "fmla v31.8h, v3.8h, v5.8h\n" + "ldr x26, [x16, #0x80]\n" + "add x26, x26, x14\n" + "fmla v30.8h, v3.8h, v6.8h\n" + "fmla v29.8h, v3.8h, v9.8h\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v13.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v13.h }[2], [x26], #0x2\n" + "b 19f\n" + "18:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v13.h }[0], [x26], #0x2\n" + "19:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v28.8h, v3.8h, v13.8h\n" + "ldr h4, [x15, #0x14]\n" + "fmla v31.8h, v4.8h, 
v6.8h\n" + "ldr x25, [x16, #0x88]\n" + "add x25, x25, x14\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v13.8h\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v8.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v8.h }[2], [x25], #0x2\n" + "b 21f\n" + "20:" // Oddments: Load input (2, 5): Bit 1: Unset + "ld1 { v8.h }[0], [x25], #0x2\n" + "21:" // Oddments: Load input (2, 5): Bit 1: End + "fmla v28.8h, v4.8h, v8.8h\n" + "ldr h0, [x15, #0x16]\n" + "fmla v31.8h, v0.8h, v14.8h\n" + "ldr x24, [x16, #0x90]\n" + "add x24, x24, x14\n" + "fmla v30.8h, v0.8h, v11.8h\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v5.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v5.h }[2], [x24], #0x2\n" + "b 23f\n" + "22:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v5.h }[0], [x24], #0x2\n" + "23:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v29.8h, v0.8h, v5.8h\n" + "ldr x23, [x16, #0x98]\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v6.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v6.h }[2], [x23], #0x2\n" + "b 25f\n" + "24:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v6.h }[0], [x23], #0x2\n" + "25:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v28.8h, v0.8h, v6.8h\n" + "ldr h1, [x15, #0x18]\n" + "fmla v31.8h, v1.8h, v11.8h\n" + "ldr x22, [x16, #0xa0]\n" + "add x22, x22, x14\n" + "fmla v30.8h, v1.8h, v12.8h\n" + "fmla v29.8h, v1.8h, v6.8h\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v10.s }[0], [x22], #0x4\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v10.h }[2], [x22], #0x2\n" + "b 27f\n" + "26:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v10.h }[0], [x22], #0x2\n" + "27:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v28.8h, v1.8h, v10.8h\n" + "ldr h2, [x15, #0x1a]\n" + "fmla v31.8h, v2.8h, v12.8h\n" + "ldr x21, [x16, #0xa8]\n" + "add x21, x21, x14\n" + "fmla v30.8h, v2.8h, v9.8h\n" + "fmla v29.8h, v2.8h, v10.8h\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v11.s }[0], [x21], #0x4\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v11.h }[2], [x21], #0x2\n" + "b 29f\n" + "28:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v11.h }[0], [x21], #0x2\n" + "29:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr h3, [x15, #0x1c]\n" + "fmla v31.8h, v3.8h, v9.8h\n" + "ldr x20, [x16, #0xb0]\n" + "add x20, x20, x14\n" + "fmla v30.8h, v3.8h, v13.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v12.s }[0], [x20], #0x4\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v12.h }[2], [x20], #0x2\n" + "b 31f\n" + "30:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v12.h }[0], [x20], #0x2\n" + "31:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr h4, [x15, #0x1e]\n" + "fmla v31.8h, v4.8h, v13.8h\n" + "ldr x19, [x16, #0xb8]\n" + "add x19, x19, x14\n" + "fmla v30.8h, v4.8h, v8.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v14.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v14.h }[2], [x19], #0x2\n" + "b 33f\n" + "32:" // Oddments: Load input (3, 5): Bit 1: Unset + "ld1 { v14.h }[0], [x19], #0x2\n" + "33:" // Oddments: Load input (3, 5): Bit 1: End + "fmla v28.8h, v4.8h, v14.8h\n" + "ldr h0, [x15, #0x20]\n" + "fmla v31.8h, v0.8h, v5.8h\n" + "ldr x26, [x16, #0xc0]\n" + "add x26, x26, x14\n" + "fmla v30.8h, v0.8h, v6.8h\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v9.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v9.h 
}[2], [x26], #0x2\n" + "b 35f\n" + "34:" // Oddments: Load input (4, 0): Bit 1: Unset + "ld1 { v9.h }[0], [x26], #0x2\n" + "35:" // Oddments: Load input (4, 0): Bit 1: End + "fmla v29.8h, v0.8h, v9.8h\n" + "ldr x25, [x16, #0xc8]\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v13.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v13.h }[2], [x25], #0x2\n" + "b 37f\n" + "36:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v13.h }[0], [x25], #0x2\n" + "37:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v28.8h, v0.8h, v13.8h\n" + "ldr h1, [x15, #0x22]\n" + "fmla v31.8h, v1.8h, v6.8h\n" + "ldr x24, [x16, #0xd0]\n" + "add x24, x24, x14\n" + "fmla v30.8h, v1.8h, v10.8h\n" + "fmla v29.8h, v1.8h, v13.8h\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v5.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v5.h }[2], [x24], #0x2\n" + "b 39f\n" + "38:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v5.h }[0], [x24], #0x2\n" + "39:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v28.8h, v1.8h, v5.8h\n" + "ldr h2, [x15, #0x24]\n" + "fmla v31.8h, v2.8h, v10.8h\n" + "ldr x23, [x16, #0xd8]\n" + "add x23, x23, x14\n" + "fmla v30.8h, v2.8h, v11.8h\n" + "fmla v29.8h, v2.8h, v5.8h\n" + "tbz %x[n_channels], #1, 40f\n" + "ld1 { v6.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 41f\n" + "ld1 { v6.h }[2], [x23], #0x2\n" + "b 41f\n" + "40:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v6.h }[0], [x23], #0x2\n" + "41:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v28.8h, v2.8h, v6.8h\n" + "ldr h3, [x15, #0x26]\n" + "fmla v31.8h, v3.8h, v11.8h\n" + "ldr x22, [x16, #0xe0]\n" + "add x22, x22, x14\n" + "fmla v30.8h, v3.8h, v12.8h\n" + "fmla v29.8h, v3.8h, v6.8h\n" + "tbz %x[n_channels], #1, 42f\n" + "ld1 { v8.s }[0], [x22], #0x4\n" + "tbz %x[n_channels], #0, 43f\n" + "ld1 { v8.h }[2], [x22], #0x2\n" + "b 43f\n" + "42:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v8.h }[0], [x22], #0x2\n" + "43:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v28.8h, v3.8h, v8.8h\n" + "ldr h4, [x15, #0x28]\n" + "fmla v31.8h, v4.8h, v12.8h\n" + "ldr x21, [x16, #0xe8]\n" + "add x21, x21, x14\n" + "fmla v30.8h, v4.8h, v14.8h\n" + "fmla v29.8h, v4.8h, v8.8h\n" + "tbz %x[n_channels], #1, 44f\n" + "ld1 { v10.s }[0], [x21], #0x4\n" + "tbz %x[n_channels], #0, 45f\n" + "ld1 { v10.h }[2], [x21], #0x2\n" + "b 45f\n" + "44:" // Oddments: Load input (4, 5): Bit 1: Unset + "ld1 { v10.h }[0], [x21], #0x2\n" + "45:" // Oddments: Load input (4, 5): Bit 1: End + "fmla v28.8h, v4.8h, v10.8h\n" + "ldr h0, [x15, #0x2a]\n" + "fmla v31.8h, v0.8h, v9.8h\n" + "ldr x20, [x16, #0xf0]\n" + "add x20, x20, x14\n" + "fmla v30.8h, v0.8h, v13.8h\n" + "tbz %x[n_channels], #1, 46f\n" + "ld1 { v11.s }[0], [x20], #0x4\n" + "tbz %x[n_channels], #0, 47f\n" + "ld1 { v11.h }[2], [x20], #0x2\n" + "b 47f\n" + "46:" // Oddments: Load input (5, 0): Bit 1: Unset + "ld1 { v11.h }[0], [x20], #0x2\n" + "47:" // Oddments: Load input (5, 0): Bit 1: End + "fmla v29.8h, v0.8h, v11.8h\n" + "ldr x19, [x16, #0xf8]\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 48f\n" + "ld1 { v12.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 49f\n" + "ld1 { v12.h }[2], [x19], #0x2\n" + "b 49f\n" + "48:" // Oddments: Load input (5, 1): Bit 1: Unset + "ld1 { v12.h }[0], [x19], #0x2\n" + "49:" // Oddments: Load input (5, 1): Bit 1: End + "fmla v28.8h, v0.8h, v12.8h\n" + "ldr h1, [x15, #0x2c]\n" + "fmla v31.8h, v1.8h, v13.8h\n" + "ldr x26, [x16, #0x100]\n" + "add x26, x26, x14\n" + "fmla v30.8h, 
v1.8h, v5.8h\n" + "fmla v29.8h, v1.8h, v12.8h\n" + "tbz %x[n_channels], #1, 50f\n" + "ld1 { v9.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #0, 51f\n" + "ld1 { v9.h }[2], [x26], #0x2\n" + "b 51f\n" + "50:" // Oddments: Load input (5, 2): Bit 1: Unset + "ld1 { v9.h }[0], [x26], #0x2\n" + "51:" // Oddments: Load input (5, 2): Bit 1: End + "fmla v28.8h, v1.8h, v9.8h\n" + "ldr h2, [x15, #0x2e]\n" + "fmla v31.8h, v2.8h, v5.8h\n" + "ldr x25, [x16, #0x108]\n" + "add x25, x25, x14\n" + "fmla v30.8h, v2.8h, v6.8h\n" + "fmla v29.8h, v2.8h, v9.8h\n" + "tbz %x[n_channels], #1, 52f\n" + "ld1 { v11.s }[0], [x25], #0x4\n" + "tbz %x[n_channels], #0, 53f\n" + "ld1 { v11.h }[2], [x25], #0x2\n" + "b 53f\n" + "52:" // Oddments: Load input (5, 3): Bit 1: Unset + "ld1 { v11.h }[0], [x25], #0x2\n" + "53:" // Oddments: Load input (5, 3): Bit 1: End + "fmla v28.8h, v2.8h, v11.8h\n" + "ldr h3, [x15, #0x30]\n" + "fmla v31.8h, v3.8h, v6.8h\n" + "ldr x24, [x16, #0x110]\n" + "add x24, x24, x14\n" + "fmla v30.8h, v3.8h, v8.8h\n" + "fmla v29.8h, v3.8h, v11.8h\n" + "tbz %x[n_channels], #1, 54f\n" + "ld1 { v12.s }[0], [x24], #0x4\n" + "tbz %x[n_channels], #0, 55f\n" + "ld1 { v12.h }[2], [x24], #0x2\n" + "b 55f\n" + "54:" // Oddments: Load input (5, 4): Bit 1: Unset + "ld1 { v12.h }[0], [x24], #0x2\n" + "55:" // Oddments: Load input (5, 4): Bit 1: End + "fmla v28.8h, v3.8h, v12.8h\n" + "ldr h4, [x15, #0x32]\n" + "fmla v31.8h, v4.8h, v8.8h\n" + "ldr x23, [x16, #0x118]\n" + "add x23, x23, x14\n" + "fmla v30.8h, v4.8h, v10.8h\n" + "fmla v29.8h, v4.8h, v12.8h\n" + "tbz %x[n_channels], #1, 56f\n" + "ld1 { v9.s }[0], [x23], #0x4\n" + "tbz %x[n_channels], #0, 57f\n" + "ld1 { v9.h }[2], [x23], #0x2\n" + "b 57f\n" + "56:" // Oddments: Load input (5, 5): Bit 1: Unset + "ld1 { v9.h }[0], [x23], #0x2\n" + "57:" // Oddments: Load input (5, 5): Bit 1: End + "fmla v28.8h, v4.8h, v9.8h\n" + "fmax v31.8h, v31.8h, v18.8h\n" + "fmax v30.8h, v30.8h, v18.8h\n" + "fmax v29.8h, v29.8h, v18.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v28.8h, v28.8h, v18.8h\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "tbz %x[n_channels], #1, 58f\n" + "st1 { v31.s }[0], [x13], #0x4\n" + "st1 { v30.s }[0], [x12], #0x4\n" + "st1 { v29.s }[0], [x10], #0x4\n" + "st1 { v28.s }[0], [x9], #0x4\n" + "tbz %x[n_channels], #0, 59f\n" + "st1 { v31.h }[2], [x13], #0x2\n" + "st1 { v30.h }[2], [x12], #0x2\n" + "st1 { v29.h }[2], [x10], #0x2\n" + "st1 { v28.h }[2], [x9], #0x2\n" + "b 59f\n" + "58:" // Oddments: Store: Bit 1: Unset + "st1 { v31.h }[0], [x13], #0x2\n" + "st1 { v30.h }[0], [x12], #0x2\n" + "st1 { v29.h }[0], [x10], #0x2\n" + "st1 { v28.h }[0], [x9], #0x2\n" + "59:" // Oddments: Store: Bit 1: End + + "60:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp new file mode 100644 index 0000000000..3468b70f29 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16); + +struct a64_fp16_nhwc_generic_output9_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int n_output_points = 9; + + kern_type kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl; + + a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..8ac79f82fa --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl( + const __fp16 *const *const inptrs, + __fp16 *const *const outptrs, + const void *params, + const void *bias, + const unsigned int n_points, + const unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + const __fp16 minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ld1r { v4.8h }, [%x[minmax_vals]]\n" + "add x19, %x[minmax_vals], #0x2\n" + "mov x11, #0x0\n" + "ld1r { v3.8h }, [x19]\n" + "lsr x10, %x[n_channels], #0x3\n" + "cbz x10, 5f\n" + "1:" // Channel loop + "movi v25.16b, #0x0\n" + "cbz %x[bias], 2f\n" + "ldr q25, [%x[bias], x11]\n" + "2:" // Channel loop: Load bias: Done + "mov v24.16b, v25.16b\n" + "ldr q23, [%x[params], #0x0]\n" + "mov x20, %x[inptrs]\n" + "mov v22.16b, v25.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, %x[n_points], #0x1\n" + "mov v21.16b, v25.16b\n" + "ldr q2, [x9, x11]\n" + "mov v20.16b, v25.16b\n" + "add %x[params], %x[params], #0x10\n" + "mov v19.16b, v25.16b\n" + "ldr q1, [x28, x11]\n" + "mov v18.16b, v25.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v17.16b, v25.16b\n" + "ldr q0, [x27, x11]\n" + "mov v16.16b, v25.16b\n" + "ldr q31, [x26, x11]\n" + "ldp x25, x24, [x20], #0x10\n" + "ldr q30, [x25, x11]\n" + "ldr q29, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "ldr q28, [x23, x11]\n" + "ldr q27, [x22, x11]\n" + "ldr x21, [x20], #0x8\n" + "ldr q26, [x21, x11]\n" + "ble 4f\n" + "3:" // Channel loop: Planar loop + "fmla v25.8h, v2.8h, v23.8h\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, x19, #0x1\n" + "fmla v24.8h, v1.8h, v23.8h\n" + "ldr q2, [x9, x11]\n" + "fmla v22.8h, v0.8h, v23.8h\n" + "fmla v21.8h, v31.8h, v23.8h\n" + "ldr q1, [x28, x11]\n" + "fmla v20.8h, v30.8h, v23.8h\n" + "ldp x27, x26, [x20], #0x10\n" + "fmla v19.8h, v29.8h, v23.8h\n" + "fmla v18.8h, v28.8h, v23.8h\n" + "ldr q0, [x27, x11]\n" + "fmla v17.8h, v27.8h, v23.8h\n" + "fmla v16.8h, v26.8h, v23.8h\n" + "ldr q23, [%x[params], #0x0]\n" + "add %x[params], %x[params], #0x10\n" + "ldr q31, [x26, x11]\n" + "ldp x25, x24, [x20], #0x10\n" + "ldr q30, [x25, x11]\n" + "ldr q29, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "ldr q28, [x23, x11]\n" + "ldr q27, [x22, x11]\n" + "ldr x21, [x20], #0x8\n" 
+ "ldr q26, [x21, x11]\n" + "bgt 3b\n" + "4:" // Channel loop: Planar tail + "fmla v25.8h, v2.8h, v23.8h\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "fmla v24.8h, v1.8h, v23.8h\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "fmla v22.8h, v0.8h, v23.8h\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "fmla v21.8h, v31.8h, v23.8h\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "fmla v20.8h, v30.8h, v23.8h\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmla v19.8h, v29.8h, v23.8h\n" + "fmla v18.8h, v28.8h, v23.8h\n" + "fmla v17.8h, v27.8h, v23.8h\n" + "fmla v16.8h, v26.8h, v23.8h\n" + "fmax v25.8h, v25.8h, v4.8h\n" + "fmax v24.8h, v24.8h, v4.8h\n" + "fmax v22.8h, v22.8h, v4.8h\n" + "fmin v25.8h, v25.8h, v3.8h\n" + "str q25, [x27, x11]\n" + "fmin v24.8h, v24.8h, v3.8h\n" + "fmin v22.8h, v22.8h, v3.8h\n" + "str q24, [x26, x11]\n" + "fmax v21.8h, v21.8h, v4.8h\n" + "fmax v20.8h, v20.8h, v4.8h\n" + "str q22, [x25, x11]\n" + "fmax v19.8h, v19.8h, v4.8h\n" + "fmax v18.8h, v18.8h, v4.8h\n" + "fmin v21.8h, v21.8h, v3.8h\n" + "str q21, [x24, x11]\n" + "fmin v20.8h, v20.8h, v3.8h\n" + "fmin v19.8h, v19.8h, v3.8h\n" + "str q20, [x23, x11]\n" + "fmin v18.8h, v18.8h, v3.8h\n" + "fmax v17.8h, v17.8h, v4.8h\n" + "str q19, [x22, x11]\n" + "fmax v16.8h, v16.8h, v4.8h\n" + "str q18, [x21, x11]\n" + "fmin v17.8h, v17.8h, v3.8h\n" + "fmin v16.8h, v16.8h, v3.8h\n" + "str q17, [x20, x11]\n" + "str q16, [x19, x11]\n" + "add x11, x11, #0x10\n" + "cmp x11, x10, LSL #4\n" + "blt 1b\n" + "5:" // Oddments + "tst %x[n_channels], #0x7\n" + "beq 25f\n" + "movi v25.16b, #0x0\n" + "cbz %x[bias], 10f\n" + "add x19, %x[bias], x11\n" + "tbz %x[n_channels], #2, 7f\n" + "ld1 { v25.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v25.s }[2], [x19], #0x4\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v25.h }[6], [x19], #0x2\n" + "b 9f\n" + "6:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v25.h }[4], [x19], #0x2\n" + "b 9f\n" + "7:" // Oddments: Load bias: Bit 2: Unset + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v25.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v25.h }[2], [x19], #0x2\n" + "b 9f\n" + "8:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v25.h }[0], [x19], #0x2\n" + "9:" // Oddments: Load bias: Bit 2: End + + "10:" // Oddments: Load bias: Done + "mov v24.16b, v25.16b\n" + "ldr q23, [%x[params], #0x0]\n" + "mov x20, %x[inptrs]\n" + "mov v22.16b, v25.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "add %x[params], %x[params], #0x10\n" + "mov v21.16b, v25.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v20.16b, v25.16b\n" + "add x9, x9, x11\n" + "mov v19.16b, v25.16b\n" + "ldp x25, x24, [x20], #0x10\n" + "mov v18.16b, v25.16b\n" + "add x28, x28, x11\n" + "mov v17.16b, v25.16b\n" + "ldp x23, x22, [x20], #0x10\n" + "mov v16.16b, v25.16b\n" + "add x27, x27, x11\n" + "ldr x21, [x20], #0x8\n" + "add x26, x26, x11\n" + "add x25, x25, x11\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #2, 12f\n" + "ldr d2, [x9], #0x8\n" + "ldr d1, [x28], #0x8\n" + "ldr d0, [x27], #0x8\n" + "ldr d31, [x26], #0x8\n" + "ldr d30, [x25], #0x8\n" + "ldr d29, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz %x[n_channels], #1, 11f\n" + "ld1 { v2.s }[2], [x9], #0x4\n" + "ld1 { v1.s }[2], [x28], #0x4\n" + "ld1 { v0.s }[2], [x27], #0x4\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v30.s }[2], [x25], #0x4\n" + "ld1 { v29.s }[2], 
[x24], #0x4\n" + "ld1 { v28.s }[2], [x23], #0x4\n" + "ld1 { v27.s }[2], [x22], #0x4\n" + "ld1 { v26.s }[2], [x21], #0x4\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v2.h }[6], [x9], #0x2\n" + "ld1 { v1.h }[6], [x28], #0x2\n" + "ld1 { v0.h }[6], [x27], #0x2\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v30.h }[6], [x25], #0x2\n" + "ld1 { v29.h }[6], [x24], #0x2\n" + "ld1 { v28.h }[6], [x23], #0x2\n" + "ld1 { v27.h }[6], [x22], #0x2\n" + "ld1 { v26.h }[6], [x21], #0x2\n" + "b 14f\n" + "11:" // Oddments: Load: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v2.h }[4], [x9], #0x2\n" + "ld1 { v1.h }[4], [x28], #0x2\n" + "ld1 { v0.h }[4], [x27], #0x2\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v30.h }[4], [x25], #0x2\n" + "ld1 { v29.h }[4], [x24], #0x2\n" + "ld1 { v28.h }[4], [x23], #0x2\n" + "ld1 { v27.h }[4], [x22], #0x2\n" + "ld1 { v26.h }[4], [x21], #0x2\n" + "b 14f\n" + "12:" // Oddments: Load: Bit 2: Unset + "tbz %x[n_channels], #1, 13f\n" + "ldr s2, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s0, [x27], #0x4\n" + "ldr s31, [x26], #0x4\n" + "ldr s30, [x25], #0x4\n" + "ldr s29, [x24], #0x4\n" + "ldr s28, [x23], #0x4\n" + "ldr s27, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v2.h }[2], [x9], #0x2\n" + "ld1 { v1.h }[2], [x28], #0x2\n" + "ld1 { v0.h }[2], [x27], #0x2\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v30.h }[2], [x25], #0x2\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "b 14f\n" + "13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 14f\n" + "ldr h2, [x9], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h0, [x27], #0x2\n" + "ldr h31, [x26], #0x2\n" + "ldr h30, [x25], #0x2\n" + "ldr h29, [x24], #0x2\n" + "ldr h28, [x23], #0x2\n" + "ldr h27, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" + "14:" // Oddments: Load: Bit 2: End + "subs x19, %x[n_points], #0x1\n" + "ble 20f\n" + "15:" // Oddments: Planar loop + "fmla v25.8h, v2.8h, v23.8h\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "fmla v24.8h, v1.8h, v23.8h\n" + "ldp x27, x26, [x20], #0x10\n" + "fmla v22.8h, v0.8h, v23.8h\n" + "ldp x25, x24, [x20], #0x10\n" + "fmla v21.8h, v31.8h, v23.8h\n" + "add x28, x28, x11\n" + "fmla v20.8h, v30.8h, v23.8h\n" + "ldp x23, x22, [x20], #0x10\n" + "fmla v19.8h, v29.8h, v23.8h\n" + "add x27, x27, x11\n" + "fmla v18.8h, v28.8h, v23.8h\n" + "ldr x21, [x20], #0x8\n" + "fmla v17.8h, v27.8h, v23.8h\n" + "add x26, x26, x11\n" + "fmla v16.8h, v26.8h, v23.8h\n" + "ldr q23, [%x[params], #0x0]\n" + "add x25, x25, x11\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "add %x[params], %x[params], #0x10\n" + "tbz %x[n_channels], #2, 17f\n" + "ldr d2, [x9], #0x8\n" + "ldr d1, [x28], #0x8\n" + "ldr d0, [x27], #0x8\n" + "ldr d31, [x26], #0x8\n" + "ldr d30, [x25], #0x8\n" + "ldr d29, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v2.s }[2], [x9], #0x4\n" + "ld1 { v1.s }[2], [x28], #0x4\n" + "ld1 { v0.s }[2], [x27], #0x4\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v30.s }[2], [x25], #0x4\n" + "ld1 { v29.s }[2], [x24], #0x4\n" + "ld1 { v28.s }[2], [x23], #0x4\n" + "ld1 { v27.s }[2], [x22], #0x4\n" + "ld1 { v26.s }[2], [x21], #0x4\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v2.h }[6], [x9], #0x2\n" + "ld1 { v1.h }[6], [x28], #0x2\n" + "ld1 { v0.h }[6], [x27], #0x2\n" + "ld1 { 
v31.h }[6], [x26], #0x2\n" + "ld1 { v30.h }[6], [x25], #0x2\n" + "ld1 { v29.h }[6], [x24], #0x2\n" + "ld1 { v28.h }[6], [x23], #0x2\n" + "ld1 { v27.h }[6], [x22], #0x2\n" + "ld1 { v26.h }[6], [x21], #0x2\n" + "b 19f\n" + "16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v2.h }[4], [x9], #0x2\n" + "ld1 { v1.h }[4], [x28], #0x2\n" + "ld1 { v0.h }[4], [x27], #0x2\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v30.h }[4], [x25], #0x2\n" + "ld1 { v29.h }[4], [x24], #0x2\n" + "ld1 { v28.h }[4], [x23], #0x2\n" + "ld1 { v27.h }[4], [x22], #0x2\n" + "ld1 { v26.h }[4], [x21], #0x2\n" + "b 19f\n" + "17:" // Oddments: Planar loop: Load: Bit 2: Unset + "tbz %x[n_channels], #1, 18f\n" + "ldr s2, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s0, [x27], #0x4\n" + "ldr s31, [x26], #0x4\n" + "ldr s30, [x25], #0x4\n" + "ldr s29, [x24], #0x4\n" + "ldr s28, [x23], #0x4\n" + "ldr s27, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v2.h }[2], [x9], #0x2\n" + "ld1 { v1.h }[2], [x28], #0x2\n" + "ld1 { v0.h }[2], [x27], #0x2\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v30.h }[2], [x25], #0x2\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "b 19f\n" + "18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 19f\n" + "ldr h2, [x9], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h0, [x27], #0x2\n" + "ldr h31, [x26], #0x2\n" + "ldr h30, [x25], #0x2\n" + "ldr h29, [x24], #0x2\n" + "ldr h28, [x23], #0x2\n" + "ldr h27, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" + "19:" // Oddments: Planar loop: Load: Bit 2: End + "subs x19, x19, #0x1\n" + "bgt 15b\n" + "20:" // Oddments: Planar tail + "fmla v25.8h, v2.8h, v23.8h\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "add x27, x27, x11\n" + "fmla v24.8h, v1.8h, v23.8h\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "fmla v22.8h, v0.8h, v23.8h\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "add x26, x26, x11\n" + "fmla v21.8h, v31.8h, v23.8h\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "fmla v20.8h, v30.8h, v23.8h\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x25, x25, x11\n" + "fmla v19.8h, v29.8h, v23.8h\n" + "add x24, x24, x11\n" + "fmla v18.8h, v28.8h, v23.8h\n" + "add x23, x23, x11\n" + "fmla v17.8h, v27.8h, v23.8h\n" + "add x22, x22, x11\n" + "fmla v16.8h, v26.8h, v23.8h\n" + "add x21, x21, x11\n" + "fmax v25.8h, v25.8h, v4.8h\n" + "add x20, x20, x11\n" + "fmax v24.8h, v24.8h, v4.8h\n" + "add x19, x19, x11\n" + "fmax v22.8h, v22.8h, v4.8h\n" + "fmin v25.8h, v25.8h, v3.8h\n" + "fmin v24.8h, v24.8h, v3.8h\n" + "fmin v22.8h, v22.8h, v3.8h\n" + "fmax v21.8h, v21.8h, v4.8h\n" + "fmax v20.8h, v20.8h, v4.8h\n" + "fmax v19.8h, v19.8h, v4.8h\n" + "fmin v21.8h, v21.8h, v3.8h\n" + "fmin v20.8h, v20.8h, v3.8h\n" + "fmin v19.8h, v19.8h, v3.8h\n" + "fmax v18.8h, v18.8h, v4.8h\n" + "fmax v17.8h, v17.8h, v4.8h\n" + "fmax v16.8h, v16.8h, v4.8h\n" + "fmin v18.8h, v18.8h, v3.8h\n" + "fmin v17.8h, v17.8h, v3.8h\n" + "fmin v16.8h, v16.8h, v3.8h\n" + "tbz %x[n_channels], #2, 22f\n" + "st1 { v25.d }[0], [x27], #0x8\n" + "st1 { v24.d }[0], [x26], #0x8\n" + "st1 { v22.d }[0], [x25], #0x8\n" + "st1 { v21.d }[0], [x24], #0x8\n" + "st1 { v20.d }[0], [x23], #0x8\n" + "st1 { v19.d }[0], [x22], #0x8\n" + "st1 { v18.d }[0], [x21], #0x8\n" + "st1 { v17.d }[0], [x20], #0x8\n" + "st1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #1, 21f\n" + "st1 { v25.s }[2], [x27], #0x4\n" + "st1 { 
v24.s }[2], [x26], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "st1 { v21.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v19.s }[2], [x22], #0x4\n" + "st1 { v18.s }[2], [x21], #0x4\n" + "st1 { v17.s }[2], [x20], #0x4\n" + "st1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[n_channels], #0, 24f\n" + "st1 { v25.h }[6], [x27], #0x2\n" + "st1 { v24.h }[6], [x26], #0x2\n" + "st1 { v22.h }[6], [x25], #0x2\n" + "st1 { v21.h }[6], [x24], #0x2\n" + "st1 { v20.h }[6], [x23], #0x2\n" + "st1 { v19.h }[6], [x22], #0x2\n" + "st1 { v18.h }[6], [x21], #0x2\n" + "st1 { v17.h }[6], [x20], #0x2\n" + "st1 { v16.h }[6], [x19], #0x2\n" + "b 24f\n" + "21:" // Oddments: Store: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 24f\n" + "st1 { v25.h }[4], [x27], #0x2\n" + "st1 { v24.h }[4], [x26], #0x2\n" + "st1 { v22.h }[4], [x25], #0x2\n" + "st1 { v21.h }[4], [x24], #0x2\n" + "st1 { v20.h }[4], [x23], #0x2\n" + "st1 { v19.h }[4], [x22], #0x2\n" + "st1 { v18.h }[4], [x21], #0x2\n" + "st1 { v17.h }[4], [x20], #0x2\n" + "st1 { v16.h }[4], [x19], #0x2\n" + "b 24f\n" + "22:" // Oddments: Store: Bit 2: Unset + "tbz %x[n_channels], #1, 23f\n" + "st1 { v25.s }[0], [x27], #0x4\n" + "st1 { v24.s }[0], [x26], #0x4\n" + "st1 { v22.s }[0], [x25], #0x4\n" + "st1 { v21.s }[0], [x24], #0x4\n" + "st1 { v20.s }[0], [x23], #0x4\n" + "st1 { v19.s }[0], [x22], #0x4\n" + "st1 { v18.s }[0], [x21], #0x4\n" + "st1 { v17.s }[0], [x20], #0x4\n" + "st1 { v16.s }[0], [x19], #0x4\n" + "tbz %x[n_channels], #0, 24f\n" + "st1 { v25.h }[2], [x27], #0x2\n" + "st1 { v24.h }[2], [x26], #0x2\n" + "st1 { v22.h }[2], [x25], #0x2\n" + "st1 { v21.h }[2], [x24], #0x2\n" + "st1 { v20.h }[2], [x23], #0x2\n" + "st1 { v19.h }[2], [x22], #0x2\n" + "st1 { v18.h }[2], [x21], #0x2\n" + "st1 { v17.h }[2], [x20], #0x2\n" + "st1 { v16.h }[2], [x19], #0x2\n" + "b 24f\n" + "23:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 24f\n" + "st1 { v25.h }[0], [x27], #0x2\n" + "st1 { v24.h }[0], [x26], #0x2\n" + "st1 { v22.h }[0], [x25], #0x2\n" + "st1 { v21.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x23], #0x2\n" + "st1 { v19.h }[0], [x22], #0x2\n" + "st1 { v18.h }[0], [x21], #0x2\n" + "st1 { v17.h }[0], [x20], #0x2\n" + "st1 { v16.h }[0], [x19], #0x2\n" + "24:" // Oddments: Store: Bit 2: End + + "25:" // End + + : [params] "+&r" (params) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp new file mode 100644 index 0000000000..a02a2b2984 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16); + +struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int output_rows(void) { return 2; }; + constexpr static unsigned int output_cols(void) { return 8; }; + + constexpr static unsigned int output_col_regs(void) { return 1; }; + + kern_type kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + + a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..7ed7c52db2 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp @@ -0,0 +1,1049 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +namespace arm_conv { +namespace depthwise { + +void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl( + const __fp16 *const *const inptrs, + __fp16 *const *const outptrs, + const __fp16 *weights, + const __fp16 *bias, + const unsigned int kernel_points, + const unsigned int n_output_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + const __fp16 minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ld1r { v7.8h }, [%x[minmax_vals]]\n" + "mov x10, #0x0\n" + "add x19, %x[minmax_vals], #0x2\n" + "ld1r { v6.8h }, [x19]\n" + "lsr x9, %x[n_output_channels], #0x3\n" + "cbz x9, 8f\n" + "1:" // Output channel loop + "movi v16.16b, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x10, #0x1\n" + "ldr q16, [%x[bias], x19]\n" + "2:" // Output channel loop: Load bias: Done + "mov v5.16b, v16.16b\n" + "ldr q4, [%x[weights], #0x0]\n" + "mov x19, %x[inptrs]\n" + "mov v31.16b, v16.16b\n" + "ldp x25, x28, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "mov v30.16b, v16.16b\n" + "ldr q3, [x25, #0x0]\n" + "mov v29.16b, v16.16b\n" + "add %x[weights], %x[weights], #0x10\n" + "mov v28.16b, v16.16b\n" + "ldr q2, [x28, #0x0]\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "mov v18.16b, v16.16b\n" + "mov v17.16b, v16.16b\n" + "cbz x20, 6f\n" + "ldp x25, x28, [x19], #0x10\n" + "ldr q16, [%x[weights], #0x0]\n" + "subs x20, x20, #0x1\n" + "add %x[weights], %x[weights], #0x10\n" + "ldr q1, [x25, #0x0]\n" + "ldr q0, [x28, #0x0]\n" + "beq 4f\n" + "3:" // Output channel loop: Kernel loop + "fmla v5.8h, v4.8h, v3.h[0]\n" + "ldp x25, x28, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "ldr q3, [x25, #0x0]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, 
v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "ldr q2, [x28, #0x0]\n" + "fmla v5.8h, v16.8h, v1.h[0]\n" + "ldr q4, [%x[weights], #0x0]\n" + "fmla v31.8h, v16.8h, v1.h[1]\n" + "ldp x25, x28, [x19], #0x10\n" + "fmla v30.8h, v16.8h, v1.h[2]\n" + "fmla v29.8h, v16.8h, v1.h[3]\n" + "fmla v28.8h, v16.8h, v1.h[4]\n" + "fmla v27.8h, v16.8h, v1.h[5]\n" + "fmla v26.8h, v16.8h, v1.h[6]\n" + "fmla v25.8h, v16.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v24.8h, v16.8h, v0.h[0]\n" + "fmla v23.8h, v16.8h, v0.h[1]\n" + "fmla v22.8h, v16.8h, v0.h[2]\n" + "fmla v21.8h, v16.8h, v0.h[3]\n" + "fmla v20.8h, v16.8h, v0.h[4]\n" + "fmla v19.8h, v16.8h, v0.h[5]\n" + "fmla v18.8h, v16.8h, v0.h[6]\n" + "fmla v17.8h, v16.8h, v0.h[7]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q16, [%x[weights], #0x10]\n" + "add %x[weights], %x[weights], #0x20\n" + "bgt 3b\n" + "4:" // Output channel loop: Kernel loop tail + "tbnz %x[kernel_points], #0, 5f\n" + "fmla v5.8h, v4.8h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "lsl x27, x10, #0x1\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "fmla v5.8h, v16.8h, v1.h[0]\n" + "fmla v31.8h, v16.8h, v1.h[1]\n" + "fmla v30.8h, v16.8h, v1.h[2]\n" + "fmla v29.8h, v16.8h, v1.h[3]\n" + "fmla v28.8h, v16.8h, v1.h[4]\n" + "fmla v27.8h, v16.8h, v1.h[5]\n" + "fmla v26.8h, v16.8h, v1.h[6]\n" + "fmla v25.8h, v16.8h, v1.h[7]\n" + "fmla v24.8h, v16.8h, v0.h[0]\n" + "fmla v23.8h, v16.8h, v0.h[1]\n" + "fmla v22.8h, v16.8h, v0.h[2]\n" + "fmla v21.8h, v16.8h, v0.h[3]\n" + "fmla v20.8h, v16.8h, v0.h[4]\n" + "fmla v19.8h, v16.8h, v0.h[5]\n" + "fmla v18.8h, v16.8h, v0.h[6]\n" + "fmla v17.8h, v16.8h, v0.h[7]\n" + "fmin v5.8h, v5.8h, v6.8h\n" + "fmin v31.8h, v31.8h, v6.8h\n" + "fmin v30.8h, v30.8h, v6.8h\n" + "fmax v5.8h, v5.8h, v7.8h\n" + "str q5, [x19, x27]\n" + "fmax v31.8h, v31.8h, v7.8h\n" + "fmax v30.8h, v30.8h, v7.8h\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmin v29.8h, v29.8h, v6.8h\n" + "str q31, [x20, x27]\n" + "fmin v28.8h, v28.8h, v6.8h\n" + "fmin v27.8h, v27.8h, v6.8h\n" + "str q30, [x21, x27]\n" + "fmax v29.8h, v29.8h, v7.8h\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin v26.8h, v26.8h, v6.8h\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax v28.8h, v28.8h, v7.8h\n" + "str q29, [x22, x27]\n" + "fmax v27.8h, v27.8h, v7.8h\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax v26.8h, v26.8h, v7.8h\n" + "str q28, [x23, x27]\n" + "fmin v25.8h, v25.8h, v6.8h\n" + "str q27, [x24, x27]\n" + "fmin v24.8h, v24.8h, v6.8h\n" + "str q26, [x25, x27]\n" + "fmin v23.8h, v23.8h, v6.8h\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmax v25.8h, v25.8h, v7.8h\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmax v24.8h, v24.8h, v7.8h\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax v23.8h, v23.8h, v7.8h\n" + "str q25, [x26, x27]\n" + "fmin v22.8h, v22.8h, v6.8h\n" + "str q24, [x19, x27]\n" + 
"fmin v21.8h, v21.8h, v6.8h\n" + "str q23, [x20, x27]\n" + "fmin v20.8h, v20.8h, v6.8h\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax v22.8h, v22.8h, v7.8h\n" + "str q22, [x21, x27]\n" + "fmax v21.8h, v21.8h, v7.8h\n" + "fmax v20.8h, v20.8h, v7.8h\n" + "str q21, [x22, x27]\n" + "fmin v19.8h, v19.8h, v6.8h\n" + "fmin v18.8h, v18.8h, v6.8h\n" + "str q20, [x23, x27]\n" + "fmin v17.8h, v17.8h, v6.8h\n" + "fmax v19.8h, v19.8h, v7.8h\n" + "str q19, [x24, x27]\n" + "fmax v18.8h, v18.8h, v7.8h\n" + "fmax v17.8h, v17.8h, v7.8h\n" + "str q18, [x25, x27]\n" + "str q17, [x26, x27]\n" + "b 7f\n" + "5:" // Output channel loop: Odd tail + "fmla v5.8h, v4.8h, v3.h[0]\n" + "ldp x25, x28, [x19], #0x10\n" + "lsl x27, x10, #0x1\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "ldr q3, [x25, #0x0]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "ldr q2, [x28, #0x0]\n" + "fmla v5.8h, v16.8h, v1.h[0]\n" + "ldr q4, [%x[weights], #0x0]\n" + "add %x[weights], %x[weights], #0x10\n" + "fmla v31.8h, v16.8h, v1.h[1]\n" + "fmla v30.8h, v16.8h, v1.h[2]\n" + "fmla v29.8h, v16.8h, v1.h[3]\n" + "fmla v28.8h, v16.8h, v1.h[4]\n" + "fmla v27.8h, v16.8h, v1.h[5]\n" + "fmla v26.8h, v16.8h, v1.h[6]\n" + "fmla v25.8h, v16.8h, v1.h[7]\n" + "fmla v24.8h, v16.8h, v0.h[0]\n" + "fmla v23.8h, v16.8h, v0.h[1]\n" + "fmla v22.8h, v16.8h, v0.h[2]\n" + "fmla v21.8h, v16.8h, v0.h[3]\n" + "fmla v20.8h, v16.8h, v0.h[4]\n" + "fmla v19.8h, v16.8h, v0.h[5]\n" + "fmla v18.8h, v16.8h, v0.h[6]\n" + "fmla v17.8h, v16.8h, v0.h[7]\n" + "fmla v5.8h, v4.8h, v3.h[0]\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "fmin v5.8h, v5.8h, v6.8h\n" + "fmin v31.8h, v31.8h, v6.8h\n" + "fmin v30.8h, v30.8h, v6.8h\n" + "fmax v5.8h, v5.8h, v7.8h\n" + "str q5, [x19, x27]\n" + "fmax v31.8h, v31.8h, v7.8h\n" + "fmax v30.8h, v30.8h, v7.8h\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmin v29.8h, v29.8h, v6.8h\n" + "str q31, [x20, x27]\n" + "fmin v28.8h, v28.8h, v6.8h\n" + "fmin v27.8h, v27.8h, v6.8h\n" + "str q30, [x21, x27]\n" + "fmax v29.8h, v29.8h, v7.8h\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin v26.8h, v26.8h, v6.8h\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax v28.8h, v28.8h, v7.8h\n" + "str q29, [x22, x27]\n" + "fmax v27.8h, v27.8h, v7.8h\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax v26.8h, v26.8h, v7.8h\n" + "str q28, [x23, x27]\n" + "fmin v25.8h, v25.8h, v6.8h\n" + "str q27, [x24, x27]\n" + "fmin v24.8h, 
v24.8h, v6.8h\n" + "str q26, [x25, x27]\n" + "fmin v23.8h, v23.8h, v6.8h\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmax v25.8h, v25.8h, v7.8h\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmax v24.8h, v24.8h, v7.8h\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax v23.8h, v23.8h, v7.8h\n" + "str q25, [x26, x27]\n" + "fmin v22.8h, v22.8h, v6.8h\n" + "str q24, [x19, x27]\n" + "fmin v21.8h, v21.8h, v6.8h\n" + "str q23, [x20, x27]\n" + "fmin v20.8h, v20.8h, v6.8h\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax v22.8h, v22.8h, v7.8h\n" + "str q22, [x21, x27]\n" + "fmax v21.8h, v21.8h, v7.8h\n" + "fmax v20.8h, v20.8h, v7.8h\n" + "str q21, [x22, x27]\n" + "fmin v19.8h, v19.8h, v6.8h\n" + "fmin v18.8h, v18.8h, v6.8h\n" + "str q20, [x23, x27]\n" + "fmin v17.8h, v17.8h, v6.8h\n" + "fmax v19.8h, v19.8h, v7.8h\n" + "str q19, [x24, x27]\n" + "fmax v18.8h, v18.8h, v7.8h\n" + "fmax v17.8h, v17.8h, v7.8h\n" + "str q18, [x25, x27]\n" + "str q17, [x26, x27]\n" + "b 7f\n" + "6:" // Output channel loop: Single kernel point + "fmla v5.8h, v4.8h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "lsl x27, x10, #0x1\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "fmin v5.8h, v5.8h, v6.8h\n" + "fmin v31.8h, v31.8h, v6.8h\n" + "fmin v30.8h, v30.8h, v6.8h\n" + "fmax v5.8h, v5.8h, v7.8h\n" + "str q5, [x19, x27]\n" + "fmax v31.8h, v31.8h, v7.8h\n" + "fmax v30.8h, v30.8h, v7.8h\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmin v29.8h, v29.8h, v6.8h\n" + "str q31, [x20, x27]\n" + "fmin v28.8h, v28.8h, v6.8h\n" + "fmin v27.8h, v27.8h, v6.8h\n" + "str q30, [x21, x27]\n" + "fmax v29.8h, v29.8h, v7.8h\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin v26.8h, v26.8h, v6.8h\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax v28.8h, v28.8h, v7.8h\n" + "str q29, [x22, x27]\n" + "fmax v27.8h, v27.8h, v7.8h\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax v26.8h, v26.8h, v7.8h\n" + "str q28, [x23, x27]\n" + "fmin v25.8h, v25.8h, v6.8h\n" + "str q27, [x24, x27]\n" + "fmin v24.8h, v24.8h, v6.8h\n" + "str q26, [x25, x27]\n" + "fmin v23.8h, v23.8h, v6.8h\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmax v25.8h, v25.8h, v7.8h\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmax v24.8h, v24.8h, v7.8h\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax v23.8h, v23.8h, v7.8h\n" + "str q25, [x26, x27]\n" + "fmin v22.8h, v22.8h, v6.8h\n" + "str q24, [x19, x27]\n" + "fmin v21.8h, v21.8h, v6.8h\n" + "str q23, [x20, x27]\n" + "fmin v20.8h, v20.8h, v6.8h\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax v22.8h, v22.8h, v7.8h\n" + "str q22, [x21, x27]\n" + "fmax v21.8h, v21.8h, v7.8h\n" + "fmax v20.8h, v20.8h, v7.8h\n" + "str q21, [x22, x27]\n" + "fmin v19.8h, v19.8h, v6.8h\n" + "fmin v18.8h, v18.8h, v6.8h\n" + "str q20, [x23, x27]\n" + "fmin v17.8h, v17.8h, v6.8h\n" + "fmax v19.8h, v19.8h, v7.8h\n" + "str q19, [x24, x27]\n" + "fmax v18.8h, v18.8h, v7.8h\n" + "fmax v17.8h, 
v17.8h, v7.8h\n" + "str q18, [x25, x27]\n" + "str q17, [x26, x27]\n" + "7:" // Output channel loop: Done + "add x10, x10, #0x8\n" + "cmp x10, x9, LSL #3\n" + "blt 1b\n" + "tst %x[n_output_channels], #0x7\n" + "beq 23f\n" + "8:" // Output channel oddments + "movi v16.16b, #0x0\n" + "cbz %x[bias], 13f\n" + "add x19, %x[bias], x10, LSL #1\n" + "tbz %x[n_output_channels], #2, 10f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #1, 9f\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[n_output_channels], #0, 12f\n" + "ld1 { v16.h }[6], [x19]\n" + "b 12f\n" + "9:" // Output channel oddments: Load bias: Bit 2: Bit 1: Unset + "tbz %x[n_output_channels], #0, 12f\n" + "ld1 { v16.h }[4], [x19]\n" + "b 12f\n" + "10:" // Output channel oddments: Load bias: Bit 2: Unset + "tbz %x[n_output_channels], #1, 11f\n" + "ld1 { v16.s }[0], [x19], #0x4\n" + "tbz %x[n_output_channels], #0, 12f\n" + "ld1 { v16.h }[2], [x19]\n" + "b 12f\n" + "11:" // Output channel oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_output_channels], #0, 12f\n" + "ld1 { v16.h }[0], [x19]\n" + "12:" // Output channel oddments: Load bias: Bit 2: End + + "13:" // Output channel oddments: Load bias: Done + "mov v5.16b, v16.16b\n" + "ldr q4, [%x[weights], #0x0]\n" + "mov x19, %x[inptrs]\n" + "mov v31.16b, v16.16b\n" + "ldp x25, x28, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "mov v30.16b, v16.16b\n" + "ldr q3, [x25, #0x0]\n" + "mov v29.16b, v16.16b\n" + "add %x[weights], %x[weights], #0x10\n" + "mov v28.16b, v16.16b\n" + "ldr q2, [x28, #0x0]\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "mov v18.16b, v16.16b\n" + "mov v17.16b, v16.16b\n" + "cbz x20, 17f\n" + "ldp x25, x28, [x19], #0x10\n" + "ldr q16, [%x[weights], #0x0]\n" + "subs x20, x20, #0x1\n" + "add %x[weights], %x[weights], #0x10\n" + "ldr q1, [x25, #0x0]\n" + "ldr q0, [x28, #0x0]\n" + "beq 15f\n" + "14:" // Output channel oddments: Kernel loop + "fmla v5.8h, v4.8h, v3.h[0]\n" + "ldp x25, x28, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "ldr q3, [x25, #0x0]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "ldr q2, [x28, #0x0]\n" + "fmla v5.8h, v16.8h, v1.h[0]\n" + "ldr q4, [%x[weights], #0x0]\n" + "fmla v31.8h, v16.8h, v1.h[1]\n" + "ldp x25, x28, [x19], #0x10\n" + "fmla v30.8h, v16.8h, v1.h[2]\n" + "fmla v29.8h, v16.8h, v1.h[3]\n" + "fmla v28.8h, v16.8h, v1.h[4]\n" + "fmla v27.8h, v16.8h, v1.h[5]\n" + "fmla v26.8h, v16.8h, v1.h[6]\n" + "fmla v25.8h, v16.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v24.8h, v16.8h, v0.h[0]\n" + "fmla v23.8h, v16.8h, v0.h[1]\n" + "fmla v22.8h, v16.8h, v0.h[2]\n" + "fmla v21.8h, v16.8h, v0.h[3]\n" + "fmla v20.8h, v16.8h, v0.h[4]\n" + "fmla v19.8h, v16.8h, v0.h[5]\n" + "fmla v18.8h, v16.8h, v0.h[6]\n" + "fmla v17.8h, v16.8h, v0.h[7]\n" + "ldr q0, [x28, #0x0]\n" + "ldr q16, [%x[weights], #0x10]\n" + "add %x[weights], %x[weights], #0x20\n" + "bgt 14b\n" + 
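+    // Note: the loop at label 14 accumulates two kernel points per iteration (kernel_points was halved by the lsr above); label 15 below finishes the last even pair, and label 16 handles an odd trailing point.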
"15:" // Output channel oddments: Kernel loop tail + "tbnz %x[kernel_points], #0, 16f\n" + "fmla v5.8h, v4.8h, v3.h[0]\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "fmla v5.8h, v16.8h, v1.h[0]\n" + "fmla v31.8h, v16.8h, v1.h[1]\n" + "fmla v30.8h, v16.8h, v1.h[2]\n" + "fmla v29.8h, v16.8h, v1.h[3]\n" + "fmla v28.8h, v16.8h, v1.h[4]\n" + "fmla v27.8h, v16.8h, v1.h[5]\n" + "fmla v26.8h, v16.8h, v1.h[6]\n" + "fmla v25.8h, v16.8h, v1.h[7]\n" + "fmla v24.8h, v16.8h, v0.h[0]\n" + "fmla v23.8h, v16.8h, v0.h[1]\n" + "fmla v22.8h, v16.8h, v0.h[2]\n" + "fmla v21.8h, v16.8h, v0.h[3]\n" + "fmla v20.8h, v16.8h, v0.h[4]\n" + "fmla v19.8h, v16.8h, v0.h[5]\n" + "fmla v18.8h, v16.8h, v0.h[6]\n" + "fmla v17.8h, v16.8h, v0.h[7]\n" + "b 18f\n" + "16:" // Output channel oddments: Odd tail + "fmla v5.8h, v4.8h, v3.h[0]\n" + "ldp x25, x28, [x19], #0x10\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "ldr q3, [x25, #0x0]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "ldr q2, [x28, #0x0]\n" + "fmla v5.8h, v16.8h, v1.h[0]\n" + "ldr q4, [%x[weights], #0x0]\n" + "add %x[weights], %x[weights], #0x10\n" + "fmla v31.8h, v16.8h, v1.h[1]\n" + "fmla v30.8h, v16.8h, v1.h[2]\n" + "fmla v29.8h, v16.8h, v1.h[3]\n" + "fmla v28.8h, v16.8h, v1.h[4]\n" + "fmla v27.8h, v16.8h, v1.h[5]\n" + "fmla v26.8h, v16.8h, v1.h[6]\n" + "fmla v25.8h, v16.8h, v1.h[7]\n" + "fmla v24.8h, v16.8h, v0.h[0]\n" + "fmla v23.8h, v16.8h, v0.h[1]\n" + "fmla v22.8h, v16.8h, v0.h[2]\n" + "fmla v21.8h, v16.8h, v0.h[3]\n" + "fmla v20.8h, v16.8h, v0.h[4]\n" + "fmla v19.8h, v16.8h, v0.h[5]\n" + "fmla v18.8h, v16.8h, v0.h[6]\n" + "fmla v17.8h, v16.8h, v0.h[7]\n" + "fmla v5.8h, v4.8h, v3.h[0]\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "b 18f\n" + "17:" // Output channel oddments: Single kernel point + "fmla v5.8h, v4.8h, v3.h[0]\n" + "fmla v31.8h, v4.8h, v3.h[1]\n" + "fmla v30.8h, v4.8h, v3.h[2]\n" + "fmla v29.8h, v4.8h, v3.h[3]\n" + "fmla v28.8h, v4.8h, v3.h[4]\n" + "fmla v27.8h, v4.8h, v3.h[5]\n" + "fmla v26.8h, v4.8h, v3.h[6]\n" + "fmla v25.8h, v4.8h, v3.h[7]\n" + "fmla v24.8h, v4.8h, v2.h[0]\n" + "fmla v23.8h, v4.8h, v2.h[1]\n" + "fmla v22.8h, v4.8h, v2.h[2]\n" + "fmla v21.8h, v4.8h, v2.h[3]\n" + "fmla v20.8h, v4.8h, 
v2.h[4]\n" + "fmla v19.8h, v4.8h, v2.h[5]\n" + "fmla v18.8h, v4.8h, v2.h[6]\n" + "fmla v17.8h, v4.8h, v2.h[7]\n" + "18:" // Output channel oddments: Done + "fmin v5.8h, v5.8h, v6.8h\n" + "fmin v31.8h, v31.8h, v6.8h\n" + "fmin v30.8h, v30.8h, v6.8h\n" + "fmin v29.8h, v29.8h, v6.8h\n" + "fmax v5.8h, v5.8h, v7.8h\n" + "fmax v31.8h, v31.8h, v7.8h\n" + "fmax v30.8h, v30.8h, v7.8h\n" + "fmax v29.8h, v29.8h, v7.8h\n" + "fmin v28.8h, v28.8h, v6.8h\n" + "fmin v27.8h, v27.8h, v6.8h\n" + "fmin v26.8h, v26.8h, v6.8h\n" + "fmax v28.8h, v28.8h, v7.8h\n" + "fmax v27.8h, v27.8h, v7.8h\n" + "fmax v26.8h, v26.8h, v7.8h\n" + "fmin v25.8h, v25.8h, v6.8h\n" + "fmin v24.8h, v24.8h, v6.8h\n" + "fmin v23.8h, v23.8h, v6.8h\n" + "fmax v25.8h, v25.8h, v7.8h\n" + "fmax v24.8h, v24.8h, v7.8h\n" + "fmax v23.8h, v23.8h, v7.8h\n" + "fmin v22.8h, v22.8h, v6.8h\n" + "fmin v21.8h, v21.8h, v6.8h\n" + "fmin v20.8h, v20.8h, v6.8h\n" + "fmax v22.8h, v22.8h, v7.8h\n" + "fmax v21.8h, v21.8h, v7.8h\n" + "fmax v20.8h, v20.8h, v7.8h\n" + "fmin v19.8h, v19.8h, v6.8h\n" + "fmin v18.8h, v18.8h, v6.8h\n" + "fmin v17.8h, v17.8h, v6.8h\n" + "fmax v19.8h, v19.8h, v7.8h\n" + "fmax v18.8h, v18.8h, v7.8h\n" + "fmax v17.8h, v17.8h, v7.8h\n" + "tbz %x[n_output_channels], #2, 20f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #1\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v5.d }[0], [x19]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v31.d }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v30.d }[0], [x21]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v29.d }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v28.d }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v27.d }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v26.d }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #1\n" + "st1 { v25.d }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v24.d }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v23.d }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v22.d }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v21.d }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v20.d }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v19.d }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v18.d }[0], [x25]\n" + "add x10, x10, #0x4\n" + "st1 { v17.d }[0], [x26]\n" + "tbz %x[n_output_channels], #1, 19f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #1\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v5.s }[2], [x19]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v31.s }[2], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v30.s }[2], [x21]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v29.s }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v28.s }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v27.s }[2], [x24]\n" + "ldr x26, 
[%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v26.s }[2], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #1\n" + "st1 { v25.s }[2], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v24.s }[2], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v23.s }[2], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v22.s }[2], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v21.s }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v20.s }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v19.s }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v18.s }[2], [x25]\n" + "add x10, x10, #0x2\n" + "st1 { v17.s }[2], [x26]\n" + "tbz %x[n_output_channels], #0, 22f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #1\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v5.h }[6], [x19]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v31.h }[6], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v30.h }[6], [x21]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v29.h }[6], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v28.h }[6], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v27.h }[6], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v26.h }[6], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #1\n" + "st1 { v25.h }[6], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v24.h }[6], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v23.h }[6], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v22.h }[6], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v21.h }[6], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v20.h }[6], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v19.h }[6], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v18.h }[6], [x25]\n" + "st1 { v17.h }[6], [x26]\n" + "b 22f\n" + "19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset + "tbz %x[n_output_channels], #0, 22f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #1\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v5.h }[4], [x19]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v31.h }[4], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v30.h }[4], [x21]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v29.h }[4], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v28.h }[4], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v27.h }[4], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v26.h }[4], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #1\n" + "st1 { v25.h }[4], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, 
x10, LSL #1\n" + "st1 { v24.h }[4], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v23.h }[4], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v22.h }[4], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v21.h }[4], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v20.h }[4], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v19.h }[4], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v18.h }[4], [x25]\n" + "st1 { v17.h }[4], [x26]\n" + "b 22f\n" + "20:" // Output channel oddments: Done: Store: Bit 2: Unset + "tbz %x[n_output_channels], #1, 21f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #1\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v5.s }[0], [x19]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v31.s }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v30.s }[0], [x21]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v29.s }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v28.s }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v27.s }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v26.s }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #1\n" + "st1 { v25.s }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v24.s }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v23.s }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v22.s }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v21.s }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v20.s }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v19.s }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v18.s }[0], [x25]\n" + "add x10, x10, #0x2\n" + "st1 { v17.s }[0], [x26]\n" + "tbz %x[n_output_channels], #0, 22f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #1\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v5.h }[2], [x19]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v31.h }[2], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v30.h }[2], [x21]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v29.h }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v28.h }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v27.h }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v26.h }[2], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #1\n" + "st1 { v25.h }[2], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v24.h }[2], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v23.h }[2], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v22.h }[2], [x21]\n" + 
"ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v21.h }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v20.h }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v19.h }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v18.h }[2], [x25]\n" + "st1 { v17.h }[2], [x26]\n" + "b 22f\n" + "21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_output_channels], #0, 22f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #1\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v5.h }[0], [x19]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v31.h }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v30.h }[0], [x21]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v29.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v28.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v27.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v26.h }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #1\n" + "st1 { v25.h }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #1\n" + "st1 { v24.h }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #1\n" + "st1 { v23.h }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #1\n" + "st1 { v22.h }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #1\n" + "st1 { v21.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #1\n" + "st1 { v20.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #1\n" + "st1 { v19.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #1\n" + "st1 { v18.h }[0], [x25]\n" + "st1 { v17.h }[0], [x26]\n" + "22:" // Output channel oddments: Done: Store: Bit 2: End + + "23:" // Done + + : [weights] "+&r" (weights) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..88f20bb125 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); +void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + +struct a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl; + + a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..fae208fbab --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,524 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "mov x17, #0x0\n" + "mov x16, #0x0\n" + "1:" // Tile loop + "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x25, #0x2\n" + "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x15, #0x2\n" + "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n" + "add x24, %x[params_struct], %[offsetof_args_min]\n" + "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "add x21, %x[params_struct], %[offsetof_args_max]\n" + "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "mov x22, #0x0\n" + "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x17, x23\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col + "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, 
x25\n" // offset *= kernel_stride * output_size + "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x12, x12, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1r { v18.4s }, [x24]\n" + "add x9, x12, x23, LSL #2\n" + "ld1r { v17.4s }, [x21]\n" + "add x28, x9, x23, LSL #2\n" + "lsl x13, x13, #0x2\n" + "add x27, x28, x23, LSL #2\n" + "add x26, x13, x13\n" + "add x25, x26, x13\n" + "mul x19, x17, x20\n" // offset = tile_i * ld_output_row + "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x15\n" // offset *= output_tile_size + "add x10, x10, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "add x24, x10, x20, LSL #2\n" + "lsl x11, x11, #0x2\n" + "mov x21, #0x10\n" // cntb _, ALL, #1 + "sub x20, XZR, x21\n" + "lsr x19, %x[n_channels], #0x2\n" + "cbz x19, 4f\n" + "ldr q16, [x14, #0x0]\n" + "ldr q0, [x14, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x14, #0x20]\n" + "ldr q2, [x14, #0x30]\n" + "ldr q3, [x14, #0x40]\n" + "ldr q4, [x14, #0x50]\n" + "ldr q5, [x14, #0x60]\n" + "ldr q6, [x14, #0x70]\n" + "ldr q7, [x14, #0x80]\n" + "ldr q8, [x14, #0x90]\n" + "add x14, x14, #0xa0\n" + "ldr q9, [x9, x13]\n" + "ld1 { v10.4s }, [x12]\n" + "ldr q11, [x12, x25]\n" + "ldr q12, [x9, x26]\n" + "ldr q13, [x28, x13]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n" + "add x22, x22, #0x10\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n" + "add x21, x21, #0x10\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "ld1 { v9.4s }, [x27]\n" + "cmp x21, x19, LSL #4\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x28, x26]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "ldr q11, [x27, x25]\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "ldr q16, [x14, #0x0]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v31.4s, v5.4s, v12.4s\n" + "fmla v30.4s, v4.4s, v12.4s\n" + "ldr q12, [x12, x13]\n" + "fmla v29.4s, v6.4s, v9.4s\n" + "ldr q9, [x12, x26]\n" + "add x12, x12, #0x10\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "fmla v31.4s, v7.4s, v13.4s\n" + "fmla v30.4s, v6.4s, v13.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "ld1 { v11.4s }, [x9]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "ldr q12, [x9, x25]\n" + "add x9, x9, #0x10\n" + "fmla v29.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x14, #0x50]\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "ld1 { v9.4s }, [x28]\n" + "ldr q1, [x14, #0x20]\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q0, [x14, #0x10]\n" + "fmla v28.4s, v2.4s, v12.4s\n" + "ldr q2, [x14, #0x30]\n" + "fmla v31.4s, v8.4s, v10.4s\n" + "fmla v30.4s, v7.4s, v10.4s\n" + "ldr q10, [x28, x25]\n" + "add x28, x28, #0x10\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "ldr q13, [x28, x13]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x27, x13]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr q12, [x27, x26]\n" + "add x27, x27, #0x10\n" + "fmla v28.4s, v5.4s, v10.4s\n" + "ldr q3, [x14, #0x40]\n" + "ldr q5, [x14, #0x60]\n" + "fmla v31.4s, v6.4s, v9.4s\n" + "ldr q9, [x9, x13]\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "ld1 { v10.4s }, [x12]\n" + "fmla v29.4s, v7.4s, v11.4s\n" + "fmla v28.4s, v6.4s, v11.4s\n" + "ldr q11, [x12, x25]\n" + "ldr q6, [x14, #0x70]\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "ldr q8, [x14, #0x90]\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "ldr q12, [x9, x26]\n" + "fmin v31.4s, 
v31.4s, v17.4s\n" + "ldr q7, [x14, #0x80]\n" + "add x14, x14, #0xa0\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "st1 { v31.4s }, [x10]\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q30, [x10, x11]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "st1 { v29.4s }, [x24]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "add x10, x10, #0x10\n" + "str q28, [x24, x11]\n" + "add x24, x24, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "ld1 { v9.4s }, [x27]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x28, x26]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "ldr q11, [x27, x25]\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v31.4s, v5.4s, v12.4s\n" + "fmla v30.4s, v4.4s, v12.4s\n" + "ldr q12, [x12, x13]\n" + "fmla v29.4s, v6.4s, v9.4s\n" + "ldr q9, [x12, x26]\n" + "add x12, x12, #0x10\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "fmla v31.4s, v7.4s, v13.4s\n" + "fmla v30.4s, v6.4s, v13.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "ld1 { v11.4s }, [x9]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "ldr q12, [x9, x25]\n" + "add x9, x9, #0x10\n" + "fmla v29.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "ld1 { v9.4s }, [x28]\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "fmla v28.4s, v2.4s, v12.4s\n" + "fmla v31.4s, v8.4s, v10.4s\n" + "fmla v30.4s, v7.4s, v10.4s\n" + "ldr q10, [x28, x25]\n" + "add x28, x28, #0x10\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x27, x13]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr q12, [x27, x26]\n" + "add x27, x27, #0x10\n" + "fmla v28.4s, v5.4s, v10.4s\n" + "fmla v31.4s, v6.4s, v9.4s\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "fmla v29.4s, v7.4s, v11.4s\n" + "fmla v28.4s, v6.4s, v11.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "st1 { v31.4s }, [x10]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "str q30, [x10, x11]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "add x10, x10, #0x10\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "st1 { v29.4s }, [x24]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "str q28, [x24, x11]\n" + "add x24, x24, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x3\n" + "beq 31f\n" + "ldr q16, [x14, #0x0]\n" + "ldr q0, [x14, #0x10]\n" + "add x23, x9, x13\n" + "ldr q1, [x14, #0x20]\n" + "add x22, x12, XZR\n" + "ldr q2, [x14, #0x30]\n" + "add x21, x12, x25\n" + "ldr q3, [x14, #0x40]\n" + "add x20, x9, x26\n" + "ldr q4, [x14, #0x50]\n" + "add x19, x28, x13\n" + "ldr q5, [x14, #0x60]\n" + "ldr q6, [x14, #0x70]\n" + "ldr q7, [x14, #0x80]\n" + "ldr q8, [x14, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr d9, [x23], #0x8\n" + "ldr d10, [x22], #0x8\n" + "ldr d11, [x21], #0x8\n" + "ldr d12, [x20], #0x8\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.s }[2], [x23]\n" + "ld1 { v10.s }[2], [x22]\n" + "ld1 { v11.s }[2], [x21]\n" + "ld1 { v12.s }[2], [x20]\n" + "ld1 { v13.s }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset + "ldr s9, [x23, #0x0]\n" + "ldr s10, [x22, #0x0]\n" + "ldr s11, [x21, #0x0]\n" + "ldr s12, 
[x20, #0x0]\n" + "ldr s13, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n" + "add x19, x27, XZR\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v31.4s, v5.4s, v12.4s\n" + "fmla v30.4s, v4.4s, v12.4s\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v29.4s, v6.4s, v9.4s\n" + "add x19, x27, x25\n" + "fmla v31.4s, v7.4s, v13.4s\n" + "fmla v30.4s, v6.4s, v13.4s\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v28.4s, v8.4s, v11.4s\n" + "add x19, x12, x13\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End + "fmla v31.4s, v1.4s, v12.4s\n" + "add x19, x12, x26\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End + "fmla v31.4s, v2.4s, v9.4s\n" + "add x19, x28, x26\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End + "fmla v31.4s, v8.4s, v10.4s\n" + "add x19, x9, XZR\n" + "fmla v30.4s, v7.4s, v10.4s\n" + "fmla v29.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End + "fmla v31.4s, v3.4s, v11.4s\n" + "add x19, x9, x25\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.4s, v5.4s, v12.4s\n" + "add x19, x28, XZR\n" + "fmla v28.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load 
inputs: (2, 0): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v9.4s\n" + "add x19, x28, x25\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v30.4s, v8.4s, v10.4s\n" + "add x19, x27, x13\n" + "fmla v28.4s, v5.4s, v10.4s\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v29.4s, v7.4s, v11.4s\n" + "add x19, x27, x26\n" + "fmla v28.4s, v6.4s, v11.4s\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v29.4s, v8.4s, v12.4s\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "tbz %x[n_channels], #1, 29f\n" + "mov x19, x10\n" + "st1 { v31.d }[0], [x19], x11\n" + "add x10, x10, #0x8\n" + "st1 { v30.d }[0], [x19]\n" + "mov x19, x24\n" + "st1 { v29.d }[0], [x19], x11\n" + "add x24, x24, #0x8\n" + "st1 { v28.d }[0], [x19]\n" + "tbz %x[n_channels], #0, 30f\n" + "mov x20, x10\n" + "st1 { v31.s }[2], [x20], x11\n" + "mov x19, x24\n" + "st1 { v30.s }[2], [x20]\n" + "st1 { v29.s }[2], [x19], x11\n" + "st1 { v28.s }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x20, x10\n" + "st1 { v31.s }[0], [x20], x11\n" + "mov x19, x24\n" + "st1 { v30.s }[0], [x20]\n" + "st1 { v29.s }[0], [x19], x11\n" + "st1 { v28.s }[0], [x19]\n" + "30:" // Tile loop: Oddments: Store: Bit 1: End + + "31:" // Tile loop: End + "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x17, #0x1\n" + "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x16, x16, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x16, x19\n" + "csel x16, x16, XZR, LT\n" + "csel x17, x17, x21, LT\n" + "cmp x17, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] 
"I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..2f93a68c23 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[16]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[5]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[3]; + inptrs[3] = input_ptrs[6]; + inptrs[4] = input_ptrs[9]; + inptrs[5] = input_ptrs[12]; + inptrs[6] = input_ptrs[15]; + inptrs[7] = input_ptrs[1]; + inptrs[8] = input_ptrs[2]; + inptrs[9] = input_ptrs[10]; + inptrs[10] = input_ptrs[4]; + inptrs[11] = input_ptrs[7]; + inptrs[12] = input_ptrs[8]; + inptrs[13] = input_ptrs[11]; + inptrs[14] = input_ptrs[13]; + inptrs[15] = input_ptrs[14]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x20, %x[params_struct], %[offsetof_args_min]\n" + "add x19, %x[params_struct], %[offsetof_args_max]\n" + "ld1r { v18.4s }, [x20]\n" + "ld1r { v17.4s }, [x19]\n" + "mov x14, #0x0\n" + "ldp x13, x12, [x21, #0x0]\n" + "mov x11, #0x10\n" // cntb _, ALL, #1 + "ldp x10, x9, [x21, #0x10]\n" + "sub x28, XZR, x11\n" + "lsr x27, %x[n_channels], #0x2\n" + "cbz x27, 3f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "cmp x11, x27, LSL #4\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldr x22, [x16, #0x20]\n" + "ldr q9, [x26, x14]\n" + "ldr q10, [x25, x14]\n" + "ldr q11, [x24, x14]\n" + "ldr q12, [x23, x14]\n" + "ldr q13, [x22, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n" + "ldr x21, [x16, #0x28]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n" + "ldr x20, [x16, #0x30]\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n" + "ldr x19, [x16, #0x38]\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "ldr q9, [x21, x14]\n" + "ldr x26, [x16, #0x40]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr x25, [x16, #0x48]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "ldr q11, [x20, x14]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q10, [x25, x14]\n" + "ldr x24, [x16, #0x50]\n" + "fmla v31.4s, v5.4s, v12.4s\n" + "ldr x23, [x16, #0x58]\n" + "fmla v30.4s, v4.4s, v12.4s\n" + "fmla v29.4s, v6.4s, v9.4s\n" + "ldr q12, [x19, x14]\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr q9, [x26, x14]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.4s, v7.4s, v13.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.4s, v6.4s, v13.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "ldr x20, [x16, #0x70]\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "ldp x26, x25, [x16, #0x0]\n" + 
"fmla v30.4s, v0.4s, v12.4s\n" + "fmla v29.4s, v5.4s, v10.4s\n" + "ldr q12, [x23, x14]\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldr q16, [x15, #0x0]\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "ldr q4, [x15, #0x50]\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q9, [x22, x14]\n" + "fmla v28.4s, v2.4s, v12.4s\n" + "ldr x22, [x16, #0x20]\n" + "ldr q0, [x15, #0x10]\n" + "fmla v31.4s, v8.4s, v10.4s\n" + "ldr q1, [x15, #0x20]\n" + "fmla v30.4s, v7.4s, v10.4s\n" + "ldr q10, [x21, x14]\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "ldr q13, [x22, x11]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x20, x14]\n" + "ldr q2, [x15, #0x30]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "fmla v28.4s, v5.4s, v10.4s\n" + "ldr q12, [x19, x14]\n" + "add x14, x14, #0x10\n" + "fmla v31.4s, v6.4s, v9.4s\n" + "ldr q9, [x26, x11]\n" + "fmla v29.4s, v7.4s, v11.4s\n" + "ldr q3, [x15, #0x40]\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "ldr q10, [x25, x11]\n" + "fmla v28.4s, v6.4s, v11.4s\n" + "ldr q11, [x24, x11]\n" + "ldr q5, [x15, #0x60]\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "ldr q6, [x15, #0x70]\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "ldr q8, [x15, #0x90]\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "ldr q12, [x23, x11]\n" + "add x11, x11, #0x10\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "ldr q7, [x15, #0x80]\n" + "cmp x11, x27, LSL #4\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "str q31, [x13, x28]\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "add x15, x15, #0xa0\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q30, [x12, x28]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "str q29, [x10, x28]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "str q28, [x9, x28]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n" + "ldr x21, [x16, #0x28]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n" + "ldr x20, [x16, #0x30]\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n" + "ldr x19, [x16, #0x38]\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "ldr q9, [x21, x14]\n" + "ldr x26, [x16, #0x40]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr x25, [x16, #0x48]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "ldr q11, [x20, x14]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q10, [x25, x14]\n" + "ldr x24, [x16, #0x50]\n" + "fmla v31.4s, v5.4s, v12.4s\n" + "ldr x23, [x16, #0x58]\n" + "fmla v30.4s, v4.4s, v12.4s\n" + "fmla v29.4s, v6.4s, v9.4s\n" + "ldr q12, [x19, x14]\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr q9, [x26, x14]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.4s, v7.4s, v13.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.4s, v6.4s, v13.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "ldr x20, [x16, #0x70]\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "ldr q12, [x23, x14]\n" + "fmla v29.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "ldr q9, [x22, x14]\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "fmla v28.4s, v2.4s, v12.4s\n" + "fmla v31.4s, v8.4s, v10.4s\n" + "fmla v30.4s, v7.4s, v10.4s\n" + "ldr q10, [x21, x14]\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x20, x14]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr q12, [x19, x14]\n" + "add x14, x14, #0x10\n" + "fmla v28.4s, v5.4s, v10.4s\n" + "fmla v31.4s, v6.4s, v9.4s\n" + "fmla v29.4s, v7.4s, v11.4s\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "fmla 
v28.4s, v6.4s, v11.4s\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "str q31, [x13, x28]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "str q30, [x12, x28]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q29, [x10, x28]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "str q28, [x9, x28]\n" + "3:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 30f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x28, x14\n" + "ldr q1, [x15, #0x20]\n" + "add x13, x13, x28\n" + "ldr q2, [x15, #0x30]\n" + "add x12, x12, x28\n" + "ldr q3, [x15, #0x40]\n" + "add x10, x10, x28\n" + "ldr q4, [x15, #0x50]\n" + "add x9, x9, x28\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x26, [x16, #0x0]\n" + "ldr x25, [x16, #0x8]\n" + "add x26, x26, x14\n" + "ldr x24, [x16, #0x10]\n" + "ldr x23, [x16, #0x18]\n" + "add x25, x25, x14\n" + "ldr x22, [x16, #0x20]\n" + "add x24, x24, x14\n" + "add x23, x23, x14\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.d }[0], [x26], #0x8\n" + "ld1 { v10.d }[0], [x25], #0x8\n" + "ld1 { v11.d }[0], [x24], #0x8\n" + "ld1 { v12.d }[0], [x23], #0x8\n" + "ld1 { v13.d }[0], [x22], #0x8\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.s }[2], [x26], #0x4\n" + "ld1 { v10.s }[2], [x25], #0x4\n" + "ld1 { v11.s }[2], [x24], #0x4\n" + "ld1 { v12.s }[2], [x23], #0x4\n" + "ld1 { v13.s }[2], [x22], #0x4\n" + "b 5f\n" + "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset + "ld1 { v9.s }[0], [x26], #0x4\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "ld1 { v12.s }[0], [x23], #0x4\n" + "ld1 { v13.s }[0], [x22], #0x4\n" + "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.4s, v4.4s, v9.4s\n" + "ldr x21, [x16, #0x28]\n" + "add x21, x21, x14\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v3.4s, v9.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v1.4s, v9.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v31.4s, v5.4s, v12.4s\n" + "fmla v30.4s, v4.4s, v12.4s\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v9.d }[0], [x21], #0x8\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v9.s }[2], [x21], #0x4\n" + "b 7f\n" + "6:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v9.s }[0], [x21], #0x4\n" + "7:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v29.4s, v6.4s, v9.4s\n" + "ldr x20, [x16, #0x30]\n" + "fmla v31.4s, v7.4s, v13.4s\n" + "add x20, x20, x14\n" + "fmla v30.4s, v6.4s, v13.4s\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v11.d }[0], [x20], #0x8\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v11.s }[2], [x20], #0x4\n" + "b 9f\n" + "8:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v11.s }[0], [x20], #0x4\n" + "9:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v28.4s, v8.4s, v11.4s\n" + "ldr x19, [x16, #0x38]\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v12.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v12.s }[2], [x19], #0x4\n" + "b 11f\n" + "10:" // Oddments: Load input (0, 1): Bit 1: Unset + "ld1 { v12.s }[0], [x19], #0x4\n" + "11:" // 
Oddments: Load input (0, 1): Bit 1: End + "fmla v31.4s, v1.4s, v12.4s\n" + "ldr x26, [x16, #0x40]\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v9.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v9.s }[2], [x26], #0x4\n" + "b 13f\n" + "12:" // Oddments: Load input (0, 2): Bit 1: Unset + "ld1 { v9.s }[0], [x26], #0x4\n" + "13:" // Oddments: Load input (0, 2): Bit 1: End + "fmla v31.4s, v2.4s, v9.4s\n" + "ldr x25, [x16, #0x48]\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v10.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v10.s }[2], [x25], #0x4\n" + "b 15f\n" + "14:" // Oddments: Load input (2, 2): Bit 1: Unset + "ld1 { v10.s }[0], [x25], #0x4\n" + "15:" // Oddments: Load input (2, 2): Bit 1: End + "fmla v31.4s, v8.4s, v10.4s\n" + "ldr x24, [x16, #0x50]\n" + "fmla v30.4s, v7.4s, v10.4s\n" + "add x24, x24, x14\n" + "fmla v29.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v11.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v11.s }[2], [x24], #0x4\n" + "b 17f\n" + "16:" // Oddments: Load input (1, 0): Bit 1: Unset + "ld1 { v11.s }[0], [x24], #0x4\n" + "17:" // Oddments: Load input (1, 0): Bit 1: End + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr x23, [x16, #0x58]\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v12.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v12.s }[2], [x23], #0x4\n" + "b 19f\n" + "18:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v12.s }[0], [x23], #0x4\n" + "19:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr x22, [x16, #0x60]\n" + "fmla v28.4s, v2.4s, v12.4s\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v9.d }[0], [x22], #0x8\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v9.s }[2], [x22], #0x4\n" + "b 21f\n" + "20:" // Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v9.s }[0], [x22], #0x4\n" + "21:" // Oddments: Load input (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v9.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "add x21, x21, x14\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v10.d }[0], [x21], #0x8\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v10.s }[2], [x21], #0x4\n" + "b 23f\n" + "22:" // Oddments: Load input (2, 3): Bit 1: Unset + "ld1 { v10.s }[0], [x21], #0x4\n" + "23:" // Oddments: Load input (2, 3): Bit 1: End + "fmla v30.4s, v8.4s, v10.4s\n" + "ldr x20, [x16, #0x70]\n" + "fmla v28.4s, v5.4s, v10.4s\n" + "add x20, x20, x14\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v11.d }[0], [x20], #0x8\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v11.s }[2], [x20], #0x4\n" + "b 25f\n" + "24:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v11.s }[0], [x20], #0x4\n" + "25:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v29.4s, v7.4s, v11.4s\n" + "ldr x19, [x16, #0x78]\n" + "fmla v28.4s, v6.4s, v11.4s\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v12.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v12.s }[2], [x19], #0x4\n" + "b 27f\n" + "26:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v12.s }[0], [x19], #0x4\n" + "27:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v29.4s, v8.4s, v12.4s\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "fmax 
v29.4s, v29.4s, v18.4s\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "tbz %x[n_channels], #1, 28f\n" + "st1 { v31.d }[0], [x13], #0x8\n" + "st1 { v30.d }[0], [x12], #0x8\n" + "st1 { v29.d }[0], [x10], #0x8\n" + "st1 { v28.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 29f\n" + "st1 { v31.s }[2], [x13], #0x4\n" + "st1 { v30.s }[2], [x12], #0x4\n" + "st1 { v29.s }[2], [x10], #0x4\n" + "st1 { v28.s }[2], [x9], #0x4\n" + "b 29f\n" + "28:" // Oddments: Store: Bit 1: Unset + "st1 { v31.s }[0], [x13], #0x4\n" + "st1 { v30.s }[0], [x12], #0x4\n" + "st1 { v29.s }[0], [x10], #0x4\n" + "st1 { v28.s }[0], [x9], #0x4\n" + "29:" // Oddments: Store: Bit 1: End + + "30:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp new file mode 100644 index 0000000000..6a882ec52f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); +void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + +struct a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 3; + constexpr static unsigned int output_cols = 3; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl; + + a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..401528aa59 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "mov x7, #0x0\n" + "mov x8, #0x0\n" + "1:" // Tile loop + "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x26, #0x3\n" + "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x25, #0x3\n" + "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n" + "add x24, %x[params_struct], %[offsetof_args_min]\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "add x21, %x[params_struct], %[offsetof_args_max]\n" + "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "mov x23, #0x0\n" + "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x7, x22\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col + "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x26\n" // offset *= kernel_stride * output_size + "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x15, x15, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1r { v18.4s }, [x24]\n" + "add x12, x15, x22, LSL #2\n" + "ld1r { v17.4s }, [x21]\n" + "add x11, x12, x22, LSL #2\n" + "lsl x16, x16, #0x2\n" + "add x10, x11, x22, LSL #2\n" + "add x9, x10, x22, LSL #2\n" + "add x28, x16, x16\n" + "add x27, x28, x16\n" + "add x26, x27, x16\n" + "mul x19, x7, x20\n" // offset = tile_i * ld_output_row + "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x25\n" // offset *= output_tile_size + "add x13, x13, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "add x25, x13, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "lsl x14, x14, #0x2\n" + "add x22, x14, x14\n" + "mov x21, #0x10\n" // cntb _, ALL, #1 + "sub x20, XZR, x21\n" + "lsr x19, %x[n_channels], #0x2\n" + "cbz x19, 4f\n" + "ldr q16, [x17, #0x0]\n" + "ldr q0, [x17, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x17, #0x20]\n" + "ldr q2, [x17, #0x30]\n" + "ldr q3, [x17, #0x40]\n" + "ldr q4, [x17, #0x50]\n" + "ldr q5,
[x17, #0x60]\n" + "ldr q6, [x17, #0x70]\n" + "ldr q7, [x17, #0x80]\n" + "ldr q8, [x17, #0x90]\n" + "add x17, x17, #0xa0\n" + "ldr q9, [x11, x28]\n" + "ld1 { v10.4s }, [x15]\n" + "ldr q11, [x15, x26]\n" + "ld1 { v12.4s }, [x9]\n" + "ldr q13, [x12, x28]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "add x23, x23, #0x10\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "add x21, x21, #0x10\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n" + "cmp x21, x19, LSL #4\n" + "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n" + "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n" + "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n" + "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n" + "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n" + "ldr q16, [x17, #0x0]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x11, x27]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "ldr q11, [x11, x16]\n" + "fmla v25.4s, v6.4s, v12.4s\n" + "ldr q12, [x9, x26]\n" + "fmla v30.4s, v4.4s, v13.4s\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v13.4s\n" + "fmla v28.4s, v2.4s, v13.4s\n" + "fmla v27.4s, v1.4s, v13.4s\n" + "fmla v26.4s, v0.4s, v13.4s\n" + "ldr q13, [x15, x16]\n" + "fmla v23.4s, v8.4s, v12.4s\n" + "ldr q12, [x15, x27]\n" + "fmla v31.4s, v7.4s, v11.4s\n" + "fmla v30.4s, v6.4s, v11.4s\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v27.4s, v3.4s, v11.4s\n" + "fmla v25.4s, v1.4s, v11.4s\n" + "fmla v24.4s, v0.4s, v11.4s\n" + "ld1 { v11.4s }, [x12]\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "ldr q13, [x12, x26]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v5.4s, v10.4s\n" + "fmla v26.4s, v4.4s, v10.4s\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "ld1 { v12.4s }, [x10]\n" + "fmla v29.4s, v7.4s, v10.4s\n" + "fmla v24.4s, v2.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "ldr q10, [x10, x28]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q11, [x10, x26]\n" + "fmla v29.4s, v5.4s, v13.4s\n" + "fmla v26.4s, v2.4s, v13.4s\n" + "ldr q13, [x9, x16]\n" + "fmla v25.4s, v3.4s, v12.4s\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "ldr q12, [x12, x16]\n" + "fmla v27.4s, v7.4s, v10.4s\n" + "fmla v26.4s, v6.4s, v10.4s\n" + "fmla v25.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v8.4s, v10.4s\n" + "fmla v24.4s, v4.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v26.4s, v8.4s, v11.4s\n" + "fmla v25.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v6.4s, v13.4s\n" + "ldr q13, [x9, x27]\n" + "fmla v23.4s, v5.4s, v11.4s\n" + "ldr q11, [x12, x27]\n" + "add x12, x12, #0x10\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v0.4s, v12.4s\n" + "ldr q12, [x10, x16]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "fmla v30.4s, v5.4s, v11.4s\n" + "fmla v26.4s, v1.4s, v11.4s\n" + "fmla v27.4s, v2.4s, v11.4s\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v24.4s, v8.4s, v13.4s\n" + "ld1 { v10.4s }, [x15]\n" + "fmla v23.4s, v7.4s, v13.4s\n" + "ldr q13, [x10, x27]\n" + "add x10, x10, #0x10\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v27.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "fmla v24.4s, v3.4s, v12.4s\n" + "ld1 { v12.4s }, [x11]\n" + "fmla v31.4s, v2.4s, v11.4s\n" + "fmla v30.4s, v1.4s, v11.4s\n" + "ldr q1, [x17, #0x20]\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x11, x26]\n" + "add x11, x11, #0x10\n" + "fmla v27.4s, v8.4s, 
v13.4s\n" + "ldr q9, [x11, x28]\n" + "fmla v26.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v5.4s, v13.4s\n" + "fmla v23.4s, v4.4s, v13.4s\n" + "ldr q13, [x9, x28]\n" + "add x9, x9, #0x10\n" + "fmla v31.4s, v6.4s, v12.4s\n" + "ldr q4, [x17, #0x50]\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr q3, [x17, #0x40]\n" + "fmla v25.4s, v0.4s, v12.4s\n" + "ld1 { v12.4s }, [x9]\n" + "fmla v29.4s, v8.4s, v11.4s\n" + "ldr q0, [x17, #0x10]\n" + "fmla v26.4s, v5.4s, v11.4s\n" + "ldr q5, [x17, #0x60]\n" + "fmla v23.4s, v2.4s, v11.4s\n" + "ldr q11, [x15, x26]\n" + "fmla v25.4s, v8.4s, v13.4s\n" + "ldr q2, [x17, #0x30]\n" + "fmla v24.4s, v7.4s, v13.4s\n" + "ldr q7, [x17, #0x80]\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "ldr q8, [x17, #0x90]\n" + "fmla v23.4s, v6.4s, v13.4s\n" + "ldr q13, [x12, x28]\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "ldr q6, [x17, #0x70]\n" + "add x17, x17, #0xa0\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "st1 { v31.4s }, [x13]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "str q30, [x13, x14]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q29, [x13, x22]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "add x13, x13, #0x10\n" + "fmax v27.4s, v27.4s, v18.4s\n" + "st1 { v28.4s }, [x25]\n" + "fmax v26.4s, v26.4s, v18.4s\n" + "fmax v25.4s, v25.4s, v18.4s\n" + "fmin v27.4s, v27.4s, v17.4s\n" + "str q27, [x25, x14]\n" + "fmin v26.4s, v26.4s, v17.4s\n" + "fmin v25.4s, v25.4s, v17.4s\n" + "str q26, [x25, x22]\n" + "fmax v24.4s, v24.4s, v18.4s\n" + "add x25, x25, #0x10\n" + "fmax v23.4s, v23.4s, v18.4s\n" + "st1 { v25.4s }, [x24]\n" + "fmin v24.4s, v24.4s, v17.4s\n" + "str q24, [x24, x14]\n" + "fmin v23.4s, v23.4s, v17.4s\n" + "str q23, [x24, x22]\n" + "add x24, x24, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n" + "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n" + "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n" + "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n" + "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n" + "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x11, x27]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "ldr q11, [x11, x16]\n" + "fmla v25.4s, v6.4s, v12.4s\n" + "ldr q12, [x9, x26]\n" + "fmla v30.4s, v4.4s, v13.4s\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v13.4s\n" + "fmla v28.4s, v2.4s, v13.4s\n" + "fmla v27.4s, v1.4s, v13.4s\n" + "fmla v26.4s, v0.4s, v13.4s\n" + "ldr q13, [x15, x16]\n" + "fmla v23.4s, v8.4s, v12.4s\n" + "ldr q12, [x15, x27]\n" + "fmla v31.4s, v7.4s, v11.4s\n" + "fmla v30.4s, v6.4s, v11.4s\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v27.4s, v3.4s, v11.4s\n" + "fmla v25.4s, v1.4s, v11.4s\n" + "fmla v24.4s, v0.4s, v11.4s\n" + "ld1 { v11.4s }, [x12]\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "ldr q13, [x12, x26]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v5.4s, v10.4s\n" + "fmla v26.4s, v4.4s, v10.4s\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "ld1 { v12.4s }, [x10]\n" + "fmla v29.4s, v7.4s, v10.4s\n" + "fmla v24.4s, v2.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "ldr q10, [x10, x28]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q11, [x10, x26]\n" + "fmla v29.4s, v5.4s, v13.4s\n" + "fmla v26.4s, v2.4s, v13.4s\n" + "ldr q13, [x9, x16]\n" + 
"fmla v25.4s, v3.4s, v12.4s\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "ldr q12, [x12, x16]\n" + "fmla v27.4s, v7.4s, v10.4s\n" + "fmla v26.4s, v6.4s, v10.4s\n" + "fmla v25.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v8.4s, v10.4s\n" + "fmla v24.4s, v4.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v26.4s, v8.4s, v11.4s\n" + "fmla v25.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v6.4s, v13.4s\n" + "ldr q13, [x9, x27]\n" + "fmla v23.4s, v5.4s, v11.4s\n" + "ldr q11, [x12, x27]\n" + "add x12, x12, #0x10\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v0.4s, v12.4s\n" + "ldr q12, [x10, x16]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "fmla v30.4s, v5.4s, v11.4s\n" + "fmla v26.4s, v1.4s, v11.4s\n" + "fmla v27.4s, v2.4s, v11.4s\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v24.4s, v8.4s, v13.4s\n" + "fmla v23.4s, v7.4s, v13.4s\n" + "ldr q13, [x10, x27]\n" + "add x10, x10, #0x10\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v27.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "fmla v24.4s, v3.4s, v12.4s\n" + "ld1 { v12.4s }, [x11]\n" + "fmla v31.4s, v2.4s, v11.4s\n" + "fmla v30.4s, v1.4s, v11.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x11, x26]\n" + "add x11, x11, #0x10\n" + "fmla v27.4s, v8.4s, v13.4s\n" + "fmla v26.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v5.4s, v13.4s\n" + "fmla v23.4s, v4.4s, v13.4s\n" + "ldr q13, [x9, x28]\n" + "add x9, x9, #0x10\n" + "fmla v31.4s, v6.4s, v12.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v25.4s, v0.4s, v12.4s\n" + "fmla v29.4s, v8.4s, v11.4s\n" + "fmla v26.4s, v5.4s, v11.4s\n" + "fmla v23.4s, v2.4s, v11.4s\n" + "fmla v25.4s, v8.4s, v13.4s\n" + "fmla v24.4s, v7.4s, v13.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmla v23.4s, v6.4s, v13.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "st1 { v31.4s }, [x13]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "str q30, [x13, x14]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q29, [x13, x22]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "add x13, x13, #0x10\n" + "fmax v27.4s, v27.4s, v18.4s\n" + "st1 { v28.4s }, [x25]\n" + "fmax v26.4s, v26.4s, v18.4s\n" + "fmax v25.4s, v25.4s, v18.4s\n" + "fmin v27.4s, v27.4s, v17.4s\n" + "str q27, [x25, x14]\n" + "fmin v26.4s, v26.4s, v17.4s\n" + "fmin v25.4s, v25.4s, v17.4s\n" + "str q26, [x25, x22]\n" + "fmax v24.4s, v24.4s, v18.4s\n" + "add x25, x25, #0x10\n" + "fmax v23.4s, v23.4s, v18.4s\n" + "st1 { v25.4s }, [x24]\n" + "fmin v24.4s, v24.4s, v17.4s\n" + "str q24, [x24, x14]\n" + "fmin v23.4s, v23.4s, v17.4s\n" + "str q23, [x24, x22]\n" + "add x24, x24, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x3\n" + "beq 49f\n" + "ldr q16, [x17, #0x0]\n" + "ldr q0, [x17, #0x10]\n" + "add x23, x11, x28\n" + "ldr q1, [x17, #0x20]\n" + "add x22, x15, XZR\n" + "ldr q2, [x17, #0x30]\n" + "add x21, x15, x26\n" + "ldr q3, [x17, #0x40]\n" + "add x20, x9, XZR\n" + "ldr q4, [x17, #0x50]\n" + "add x19, x12, x28\n" + "ldr q5, [x17, #0x60]\n" + "ldr q6, [x17, #0x70]\n" + "ldr q7, [x17, #0x80]\n" + "ldr q8, [x17, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr d9, [x23], #0x8\n" + "ldr d10, [x22], #0x8\n" + "ldr d11, [x21], #0x8\n" + "ldr d12, [x20], #0x8\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.s }[2], [x23]\n" + "ld1 { v10.s }[2], [x22]\n" + "ld1 { v11.s }[2], [x21]\n" + "ld1 { v12.s }[2], [x20]\n" + "ld1 { v13.s }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (2, 2), 
(0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset + "ldr s9, [x23, #0x0]\n" + "ldr s10, [x22, #0x0]\n" + "ldr s11, [x21, #0x0]\n" + "ldr s12, [x20, #0x0]\n" + "ldr s13, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "add x19, x9, x26\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n" + "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n" + "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n" + "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n" + "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n" + "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "fmla v25.4s, v6.4s, v12.4s\n" + "fmla v30.4s, v4.4s, v13.4s\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v13.4s\n" + "fmla v28.4s, v2.4s, v13.4s\n" + "fmla v27.4s, v1.4s, v13.4s\n" + "fmla v26.4s, v0.4s, v13.4s\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v23.4s, v8.4s, v12.4s\n" + "add x19, x11, x16\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v31.4s, v7.4s, v11.4s\n" + "add x19, x15, x16\n" + "fmla v30.4s, v6.4s, v11.4s\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v27.4s, v3.4s, v11.4s\n" + "fmla v25.4s, v1.4s, v11.4s\n" + "fmla v24.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End + "fmla v31.4s, v1.4s, v13.4s\n" + "add x19, x15, x27\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End + "fmla v30.4s, v2.4s, v12.4s\n" + "add x19, x11, x27\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v30.4s, v8.4s, v10.4s\n" + "add x19, x12, XZR\n" + "fmla v29.4s, v7.4s, v10.4s\n" + "fmla v27.4s, v5.4s, v10.4s\n" + "fmla v26.4s, v4.4s, v10.4s\n" + "fmla v24.4s, v2.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End + "fmla v31.4s, v3.4s, 
v11.4s\n" + "add x19, x12, x26\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End + "fmla v29.4s, v5.4s, v13.4s\n" + "add x19, x10, XZR\n" + "fmla v26.4s, v2.4s, v13.4s\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v28.4s, v6.4s, v12.4s\n" + "add x19, x10, x28\n" + "fmla v25.4s, v3.4s, v12.4s\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v28.4s, v8.4s, v10.4s\n" + "add x19, x10, x26\n" + "fmla v27.4s, v7.4s, v10.4s\n" + "fmla v26.4s, v6.4s, v10.4s\n" + "fmla v25.4s, v5.4s, v10.4s\n" + "fmla v24.4s, v4.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v26.4s, v8.4s, v11.4s\n" + "add x19, x9, x16\n" + "fmla v23.4s, v5.4s, v11.4s\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End + "fmla v25.4s, v7.4s, v13.4s\n" + "add x19, x12, x16\n" + "fmla v24.4s, v6.4s, v13.4s\n" + "tbz %x[n_channels], #1, 29f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End + "fmla v31.4s, v4.4s, v12.4s\n" + "add x19, x12, x27\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.4s, v5.4s, v11.4s\n" + "add x19, x9, x27\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "fmla v27.4s, v2.4s, v11.4s\n" + "fmla v26.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v24.4s, v8.4s, v13.4s\n" + "add x19, x10, x16\n" + "fmla v23.4s, v7.4s, v13.4s\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 36f\n" + "35:" // Tile 
loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v28.4s, v7.4s, v12.4s\n" + "add x19, x15, x28\n" + "fmla v27.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "fmla v24.4s, v3.4s, v12.4s\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End + "fmla v31.4s, v2.4s, v11.4s\n" + "add x19, x10, x27\n" + "fmla v30.4s, v1.4s, v11.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v27.4s, v8.4s, v13.4s\n" + "add x19, x11, XZR\n" + "fmla v26.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v5.4s, v13.4s\n" + "fmla v23.4s, v4.4s, v13.4s\n" + "tbz %x[n_channels], #1, 41f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 42f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 42f\n" + "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v12.4s\n" + "add x19, x11, x26\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v25.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 43f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 44f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 44f\n" + "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v29.4s, v8.4s, v11.4s\n" + "add x19, x9, x28\n" + "fmla v26.4s, v5.4s, v11.4s\n" + "fmla v23.4s, v2.4s, v11.4s\n" + "tbz %x[n_channels], #1, 45f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 46f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 46f\n" + "45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End + "fmla v25.4s, v8.4s, v13.4s\n" + "fmla v24.4s, v7.4s, v13.4s\n" + "fmla v23.4s, v6.4s, v13.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "fmax v27.4s, v27.4s, v18.4s\n" + "fmax v26.4s, v26.4s, v18.4s\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "fmin v27.4s, v27.4s, v17.4s\n" + "fmin v26.4s, v26.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v18.4s\n" + "fmax v24.4s, v24.4s, v18.4s\n" + "fmax v23.4s, v23.4s, v18.4s\n" + "fmin v25.4s, v25.4s, v17.4s\n" + "fmin v24.4s, v24.4s, v17.4s\n" + "fmin v23.4s, v23.4s, v17.4s\n" + "tbz %x[n_channels], #1, 47f\n" + "mov x19, x13\n" + "st1 { v31.d }[0], [x19], x14\n" + "add x13, x13, #0x8\n" + "st1 { v30.d }[0], [x19], x14\n" + "mov x20, x25\n" + "st1 { v29.d }[0], [x19]\n" + "st1 { v28.d }[0], [x20], x14\n" + "add x25, x25, #0x8\n" + "st1 { v27.d }[0], [x20], x14\n" + "mov x19, x24\n" + "st1 { v26.d }[0], [x20]\n" + "add x24, x24, #0x8\n" + "st1 { v25.d }[0], [x19], x14\n" + "st1 { v24.d }[0], [x19], x14\n" + "st1 { v23.d }[0], [x19]\n" + "tbz %x[n_channels], #0, 48f\n" + "mov x21, x13\n" + "st1 { v31.s }[2], [x21], 
x14\n" + "mov x20, x25\n" + "st1 { v30.s }[2], [x21], x14\n" + "st1 { v28.s }[2], [x20], x14\n" + "mov x19, x24\n" + "st1 { v29.s }[2], [x21]\n" + "st1 { v27.s }[2], [x20], x14\n" + "st1 { v26.s }[2], [x20]\n" + "st1 { v25.s }[2], [x19], x14\n" + "st1 { v24.s }[2], [x19], x14\n" + "st1 { v23.s }[2], [x19]\n" + "b 48f\n" + "47:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x21, x13\n" + "st1 { v31.s }[0], [x21], x14\n" + "mov x20, x25\n" + "mov x19, x24\n" + "st1 { v30.s }[0], [x21], x14\n" + "st1 { v28.s }[0], [x20], x14\n" + "st1 { v29.s }[0], [x21]\n" + "st1 { v27.s }[0], [x20], x14\n" + "st1 { v26.s }[0], [x20]\n" + "st1 { v25.s }[0], [x19], x14\n" + "st1 { v24.s }[0], [x19], x14\n" + "st1 { v23.s }[0], [x19]\n" + "48:" // Tile loop: Oddments: Store: Bit 1: End + + "49:" // Tile loop: End + "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x7, #0x1\n" + "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x8, x8, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x8, x19\n" + "csel x8, x8, XZR, LT\n" + "csel x7, x7, x21, LT\n" + "cmp x7, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..39ec001ae1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,903 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[25]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[12]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[4]; + inptrs[3] = input_ptrs[20]; + inptrs[4] = input_ptrs[7]; + inptrs[5] = input_ptrs[24]; + inptrs[6] = input_ptrs[11]; + inptrs[7] = input_ptrs[1]; + inptrs[8] = input_ptrs[3]; + inptrs[9] = input_ptrs[13]; + inptrs[10] = input_ptrs[5]; + inptrs[11] = input_ptrs[9]; + inptrs[12] = input_ptrs[15]; + inptrs[13] = input_ptrs[17]; + inptrs[14] = input_ptrs[19]; + inptrs[15] = input_ptrs[21]; + inptrs[16] = input_ptrs[6]; + inptrs[17] = input_ptrs[8]; + inptrs[18] = input_ptrs[23]; + inptrs[19] = input_ptrs[16]; + inptrs[20] = input_ptrs[2]; + inptrs[21] = input_ptrs[18]; + inptrs[22] = input_ptrs[10]; + inptrs[23] = input_ptrs[14]; + inptrs[24] = input_ptrs[22]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x20, %x[params_struct], %[offsetof_args_min]\n" + "add x19, %x[params_struct], %[offsetof_args_max]\n" + "ld1r { v18.4s }, [x20]\n" + "ld1r { v17.4s }, [x19]\n" + "mov x14, #0x0\n" + "mov x13, #0x10\n" // cntb _, ALL, #1 + "sub x12, XZR, x13\n" + "lsr x11, %x[n_channels], #0x2\n" + "cbz x11, 3f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "cmp x13, x11, LSL #4\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "ldp x10, x9, [x16, #0x0]\n" + "ldp x28, x27, [x16, #0x10]\n" + "ldr x26, [x16,
#0x20]\n" + "ldr q9, [x10, x14]\n" + "ldr q10, [x9, x14]\n" + "ldr q11, [x28, x14]\n" + "ldr q12, [x27, x14]\n" + "ldr q13, [x26, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x25, [x16, #0x28]\n" + "add x12, x12, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "ldr x24, [x16, #0x30]\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "ldr x23, [x16, #0x38]\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n" + "ldr x10, [x16, #0x40]\n" + "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n" + "ldr x9, [x16, #0x48]\n" + "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n" + "ldr x28, [x16, #0x50]\n" + "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n" + "ldr x27, [x16, #0x58]\n" + "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n" + "ldr x26, [x16, #0x60]\n" + "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n" + "ldr x22, [x17, #0x0]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x9, x14]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v25.4s, v6.4s, v12.4s\n" + "ldr q12, [x25, x14]\n" + "fmla v30.4s, v4.4s, v13.4s\n" + "ldr x25, [x16, #0x68]\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "ldr x24, [x16, #0x70]\n" + "fmla v29.4s, v3.4s, v13.4s\n" + "ldr x9, [x16, #0x88]\n" + "fmla v28.4s, v2.4s, v13.4s\n" + "ldr x21, [x17, #0x8]\n" + "fmla v27.4s, v1.4s, v13.4s\n" + "ldr x20, [x17, #0x10]\n" + "fmla v26.4s, v0.4s, v13.4s\n" + "ldr q13, [x23, x14]\n" + "fmla v23.4s, v8.4s, v12.4s\n" + "ldr q12, [x10, x14]\n" + "fmla v31.4s, v7.4s, v11.4s\n" + "ldr x23, [x16, #0x78]\n" + "fmla v30.4s, v6.4s, v11.4s\n" + "ldr x10, [x16, #0x80]\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "ldr x19, [x17, #0x18]\n" + "fmla v27.4s, v3.4s, v11.4s\n" + "ldr q16, [x15, #0x0]\n" + "fmla v25.4s, v1.4s, v11.4s\n" + "fmla v24.4s, v0.4s, v11.4s\n" + "ldr q11, [x28, x14]\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "ldr q13, [x27, x14]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "ldr x27, [x16, #0x98]\n" + "fmla v27.4s, v5.4s, v10.4s\n" + "fmla v26.4s, v4.4s, v10.4s\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "ldr q12, [x26, x14]\n" + "fmla v29.4s, v7.4s, v10.4s\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v24.4s, v2.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "ldr q10, [x25, x14]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v29.4s, v5.4s, v13.4s\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.4s, v2.4s, v13.4s\n" + "ldr q13, [x23, x14]\n" + "fmla v25.4s, v3.4s, v12.4s\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "ldr q12, [x10, x14]\n" + "fmla v27.4s, v7.4s, v10.4s\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v26.4s, v6.4s, v10.4s\n" + "fmla v25.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v8.4s, v10.4s\n" + "fmla v24.4s, v4.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v26.4s, v8.4s, v11.4s\n" + "fmla v25.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v6.4s, v13.4s\n" + "ldr q13, [x28, x14]\n" + "fmla v23.4s, v5.4s, v11.4s\n" + "ldr q11, [x9, x14]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v0.4s, v12.4s\n" + "ldr q12, [x27, x14]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "fmla v30.4s, v5.4s, v11.4s\n" + "fmla v26.4s, v1.4s, v11.4s\n" + "fmla v27.4s, v2.4s, v11.4s\n" + "ldr q11, [x26, x14]\n" + "fmla v24.4s, v8.4s, v13.4s\n" + "ldr x26, [x16, #0x20]\n" + "fmla v23.4s, v7.4s, v13.4s\n" + "ldr 
q13, [x25, x14]\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v27.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "fmla v24.4s, v3.4s, v12.4s\n" + "ldr q12, [x24, x14]\n" + "fmla v31.4s, v2.4s, v11.4s\n" + "fmla v30.4s, v1.4s, v11.4s\n" + "ldr q1, [x15, #0x20]\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x23, x14]\n" + "fmla v27.4s, v8.4s, v13.4s\n" + "fmla v26.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v5.4s, v13.4s\n" + "fmla v23.4s, v4.4s, v13.4s\n" + "ldr q13, [x10, x14]\n" + "add x14, x14, #0x10\n" + "fmla v31.4s, v6.4s, v12.4s\n" + "ldp x10, x9, [x16, #0x0]\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldp x28, x27, [x16, #0x10]\n" + "fmla v25.4s, v0.4s, v12.4s\n" + "ldr q0, [x15, #0x10]\n" + "fmla v29.4s, v8.4s, v11.4s\n" + "ldr q9, [x10, x13]\n" + "fmla v26.4s, v5.4s, v11.4s\n" + "ldr q10, [x9, x13]\n" + "fmla v23.4s, v2.4s, v11.4s\n" + "ldr q11, [x28, x13]\n" + "fmla v25.4s, v8.4s, v13.4s\n" + "ldr q12, [x27, x13]\n" + "fmla v24.4s, v7.4s, v13.4s\n" + "ldr q2, [x15, #0x30]\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "ldr q3, [x15, #0x40]\n" + "fmla v23.4s, v6.4s, v13.4s\n" + "ldr q13, [x26, x13]\n" + "add x13, x13, #0x10\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "ldr q4, [x15, #0x50]\n" + "cmp x13, x11, LSL #4\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "ldr q5, [x15, #0x60]\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "ldr q6, [x15, #0x70]\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q31, [x22, x12]\n" + "fmax v27.4s, v27.4s, v18.4s\n" + "ldr x22, [x17, #0x20]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "ldr q7, [x15, #0x80]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "str q30, [x21, x12]\n" + "fmin v27.4s, v27.4s, v17.4s\n" + "str q29, [x20, x12]\n" + "fmax v26.4s, v26.4s, v18.4s\n" + "ldr x21, [x17, #0x28]\n" + "fmax v25.4s, v25.4s, v18.4s\n" + "str q28, [x19, x12]\n" + "fmax v24.4s, v24.4s, v18.4s\n" + "str q27, [x22, x12]\n" + "fmin v26.4s, v26.4s, v17.4s\n" + "ldr x20, [x17, #0x30]\n" + "fmin v25.4s, v25.4s, v17.4s\n" + "ldr x19, [x17, #0x38]\n" + "fmin v24.4s, v24.4s, v17.4s\n" + "str q26, [x21, x12]\n" + "fmax v23.4s, v23.4s, v18.4s\n" + "str q25, [x20, x12]\n" + "ldr x22, [x17, #0x40]\n" + "fmin v23.4s, v23.4s, v17.4s\n" + "str q24, [x19, x12]\n" + "str q23, [x22, x12]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x25, [x16, #0x28]\n" + "add x12, x12, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "ldr x24, [x16, #0x30]\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "ldr x23, [x16, #0x38]\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n" + "ldr x10, [x16, #0x40]\n" + "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n" + "ldr x9, [x16, #0x48]\n" + "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n" + "ldr x28, [x16, #0x50]\n" + "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n" + "ldr x27, [x16, #0x58]\n" + "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n" + "ldr x26, [x16, #0x60]\n" + "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n" + "ldr x22, [x17, #0x0]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x9, x14]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v25.4s, v6.4s, v12.4s\n" + "ldr q12, [x25, x14]\n" + "fmla v30.4s, v4.4s, v13.4s\n" + "ldr x25, [x16, #0x68]\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "ldr x24, [x16, #0x70]\n" + "fmla v29.4s, v3.4s, v13.4s\n" + "ldr x9, [x16, #0x88]\n" + "fmla v28.4s, v2.4s, v13.4s\n" + "ldr x21, [x17, #0x8]\n" + "fmla v27.4s, v1.4s, v13.4s\n" + "ldr x20, 
[x17, #0x10]\n" + "fmla v26.4s, v0.4s, v13.4s\n" + "ldr q13, [x23, x14]\n" + "fmla v23.4s, v8.4s, v12.4s\n" + "ldr q12, [x10, x14]\n" + "fmla v31.4s, v7.4s, v11.4s\n" + "ldr x23, [x16, #0x78]\n" + "fmla v30.4s, v6.4s, v11.4s\n" + "ldr x10, [x16, #0x80]\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "ldr x19, [x17, #0x18]\n" + "fmla v27.4s, v3.4s, v11.4s\n" + "fmla v25.4s, v1.4s, v11.4s\n" + "fmla v24.4s, v0.4s, v11.4s\n" + "ldr q11, [x28, x14]\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "ldr q13, [x27, x14]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "ldr x27, [x16, #0x98]\n" + "fmla v27.4s, v5.4s, v10.4s\n" + "fmla v26.4s, v4.4s, v10.4s\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "ldr q12, [x26, x14]\n" + "fmla v29.4s, v7.4s, v10.4s\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v24.4s, v2.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v30.4s, v8.4s, v10.4s\n" + "ldr q10, [x25, x14]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v29.4s, v5.4s, v13.4s\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.4s, v2.4s, v13.4s\n" + "ldr q13, [x23, x14]\n" + "fmla v25.4s, v3.4s, v12.4s\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "ldr q12, [x10, x14]\n" + "fmla v27.4s, v7.4s, v10.4s\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v26.4s, v6.4s, v10.4s\n" + "fmla v25.4s, v5.4s, v10.4s\n" + "fmla v28.4s, v8.4s, v10.4s\n" + "fmla v24.4s, v4.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v26.4s, v8.4s, v11.4s\n" + "fmla v25.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v6.4s, v13.4s\n" + "ldr q13, [x28, x14]\n" + "fmla v23.4s, v5.4s, v11.4s\n" + "ldr q11, [x9, x14]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v0.4s, v12.4s\n" + "ldr q12, [x27, x14]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "fmla v30.4s, v5.4s, v11.4s\n" + "fmla v26.4s, v1.4s, v11.4s\n" + "fmla v27.4s, v2.4s, v11.4s\n" + "ldr q11, [x26, x14]\n" + "fmla v24.4s, v8.4s, v13.4s\n" + "fmla v23.4s, v7.4s, v13.4s\n" + "ldr q13, [x25, x14]\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v27.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "fmla v24.4s, v3.4s, v12.4s\n" + "ldr q12, [x24, x14]\n" + "fmla v31.4s, v2.4s, v11.4s\n" + "fmla v30.4s, v1.4s, v11.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x23, x14]\n" + "fmla v27.4s, v8.4s, v13.4s\n" + "fmla v26.4s, v7.4s, v13.4s\n" + "fmla v24.4s, v5.4s, v13.4s\n" + "fmla v23.4s, v4.4s, v13.4s\n" + "ldr q13, [x10, x14]\n" + "add x14, x14, #0x10\n" + "fmla v31.4s, v6.4s, v12.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v25.4s, v0.4s, v12.4s\n" + "fmla v29.4s, v8.4s, v11.4s\n" + "fmla v26.4s, v5.4s, v11.4s\n" + "fmla v23.4s, v2.4s, v11.4s\n" + "fmla v25.4s, v8.4s, v13.4s\n" + "fmla v24.4s, v7.4s, v13.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmla v23.4s, v6.4s, v13.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "str q31, [x22, x12]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "ldr x22, [x17, #0x20]\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q30, [x21, x12]\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v18.4s\n" + "ldr x21, [x17, #0x28]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "str q29, [x20, x12]\n" + "fmin v27.4s, v27.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v18.4s\n" + "str q28, [x19, x12]\n" + "fmax v25.4s, v25.4s, v18.4s\n" + "ldr x20, [x17, #0x30]\n" + "fmax v24.4s, v24.4s, v18.4s\n" + "str q27, [x22, x12]\n" + "fmin v26.4s, v26.4s, 
v17.4s\n" + "ldr x19, [x17, #0x38]\n" + "fmin v25.4s, v25.4s, v17.4s\n" + "ldr x22, [x17, #0x40]\n" + "fmin v24.4s, v24.4s, v17.4s\n" + "str q26, [x21, x12]\n" + "fmax v23.4s, v23.4s, v18.4s\n" + "str q25, [x20, x12]\n" + "str q24, [x19, x12]\n" + "fmin v23.4s, v23.4s, v17.4s\n" + "str q23, [x22, x12]\n" + "3:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 48f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x12, x14\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x10, [x16, #0x0]\n" + "add x10, x10, x14\n" + "ldr x9, [x16, #0x8]\n" + "ldr x28, [x16, #0x10]\n" + "add x9, x9, x14\n" + "ldr x27, [x16, #0x18]\n" + "ldr x26, [x16, #0x20]\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.d }[0], [x10], #0x8\n" + "ld1 { v10.d }[0], [x9], #0x8\n" + "ld1 { v11.d }[0], [x28], #0x8\n" + "ld1 { v12.d }[0], [x27], #0x8\n" + "ld1 { v13.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.s }[2], [x10], #0x4\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "ld1 { v11.s }[2], [x28], #0x4\n" + "ld1 { v12.s }[2], [x27], #0x4\n" + "ld1 { v13.s }[2], [x26], #0x4\n" + "b 5f\n" + "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset + "ld1 { v9.s }[0], [x10], #0x4\n" + "ld1 { v10.s }[0], [x9], #0x4\n" + "ld1 { v11.s }[0], [x28], #0x4\n" + "ld1 { v12.s }[0], [x27], #0x4\n" + "ld1 { v13.s }[0], [x26], #0x4\n" + "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x25, [x16, #0x28]\n" + "add x25, x25, x14\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v5.4s, v9.4s\n" + "mov v27.16b, v16.16b\n fmla v27.4s, v4.4s, v9.4s\n" + "mov v26.16b, v16.16b\n fmla v26.4s, v3.4s, v9.4s\n" + "mov v25.16b, v16.16b\n fmla v25.4s, v2.4s, v9.4s\n" + "mov v24.16b, v16.16b\n fmla v24.4s, v1.4s, v9.4s\n" + "mov v23.16b, v16.16b\n fmla v23.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "fmla v25.4s, v6.4s, v12.4s\n" + "fmla v30.4s, v4.4s, v13.4s\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v13.4s\n" + "fmla v28.4s, v2.4s, v13.4s\n" + "fmla v27.4s, v1.4s, v13.4s\n" + "fmla v26.4s, v0.4s, v13.4s\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v12.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "b 7f\n" + "6:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v12.s }[0], [x25], #0x4\n" + "7:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v23.4s, v8.4s, v12.4s\n" + "ldr x24, [x16, #0x30]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v11.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v11.s }[2], [x24], #0x4\n" + "b 9f\n" + "8:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v11.s }[0], [x24], #0x4\n" + "9:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v31.4s, v7.4s, v11.4s\n" + "ldr x23, [x16, #0x38]\n" + "fmla v30.4s, v6.4s, v11.4s\n" + "add x23, x23, x14\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v27.4s, v3.4s, v11.4s\n" + "fmla v25.4s, v1.4s, v11.4s\n" + "fmla v24.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v13.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 11f\n" + 
"ld1 { v13.s }[2], [x23], #0x4\n" + "b 11f\n" + "10:" // Oddments: Load input (0, 1): Bit 1: Unset + "ld1 { v13.s }[0], [x23], #0x4\n" + "11:" // Oddments: Load input (0, 1): Bit 1: End + "fmla v31.4s, v1.4s, v13.4s\n" + "ldr x10, [x16, #0x40]\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "add x10, x10, x14\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v12.d }[0], [x10], #0x8\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v12.s }[2], [x10], #0x4\n" + "b 13f\n" + "12:" // Oddments: Load input (0, 3): Bit 1: Unset + "ld1 { v12.s }[0], [x10], #0x4\n" + "13:" // Oddments: Load input (0, 3): Bit 1: End + "fmla v30.4s, v2.4s, v12.4s\n" + "ldr x9, [x16, #0x48]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "add x9, x9, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v10.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "b 15f\n" + "14:" // Oddments: Load input (2, 3): Bit 1: Unset + "ld1 { v10.s }[0], [x9], #0x4\n" + "15:" // Oddments: Load input (2, 3): Bit 1: End + "fmla v30.4s, v8.4s, v10.4s\n" + "ldr x28, [x16, #0x50]\n" + "fmla v29.4s, v7.4s, v10.4s\n" + "add x28, x28, x14\n" + "fmla v27.4s, v5.4s, v10.4s\n" + "fmla v26.4s, v4.4s, v10.4s\n" + "fmla v24.4s, v2.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v11.d }[0], [x28], #0x8\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v11.s }[2], [x28], #0x4\n" + "b 17f\n" + "16:" // Oddments: Load input (1, 0): Bit 1: Unset + "ld1 { v11.s }[0], [x28], #0x4\n" + "17:" // Oddments: Load input (1, 0): Bit 1: End + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr x27, [x16, #0x58]\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v13.d }[0], [x27], #0x8\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v13.s }[2], [x27], #0x4\n" + "b 19f\n" + "18:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v13.s }[0], [x27], #0x4\n" + "19:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v29.4s, v5.4s, v13.4s\n" + "ldr x26, [x16, #0x60]\n" + "fmla v26.4s, v2.4s, v13.4s\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v12.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "b 21f\n" + "20:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v12.s }[0], [x26], #0x4\n" + "21:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v28.4s, v6.4s, v12.4s\n" + "ldr x25, [x16, #0x68]\n" + "fmla v25.4s, v3.4s, v12.4s\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v10.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v10.s }[2], [x25], #0x4\n" + "b 23f\n" + "22:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v10.s }[0], [x25], #0x4\n" + "23:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v28.4s, v8.4s, v10.4s\n" + "ldr x24, [x16, #0x70]\n" + "fmla v27.4s, v7.4s, v10.4s\n" + "add x24, x24, x14\n" + "fmla v26.4s, v6.4s, v10.4s\n" + "fmla v25.4s, v5.4s, v10.4s\n" + "fmla v24.4s, v4.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v11.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v11.s }[2], [x24], #0x4\n" + "b 25f\n" + "24:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x24], #0x4\n" + "25:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v26.4s, v8.4s, v11.4s\n" + "ldr x23, [x16, #0x78]\n" + "fmla v23.4s, v5.4s, v11.4s\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v13.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v13.s 
}[2], [x23], #0x4\n" + "b 27f\n" + "26:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v13.s }[0], [x23], #0x4\n" + "27:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v25.4s, v7.4s, v13.4s\n" + "ldr x10, [x16, #0x80]\n" + "fmla v24.4s, v6.4s, v13.4s\n" + "add x10, x10, x14\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v12.d }[0], [x10], #0x8\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v12.s }[2], [x10], #0x4\n" + "b 29f\n" + "28:" // Oddments: Load input (1, 1): Bit 1: Unset + "ld1 { v12.s }[0], [x10], #0x4\n" + "29:" // Oddments: Load input (1, 1): Bit 1: End + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr x9, [x16, #0x88]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "add x9, x9, x14\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v27.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v11.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "b 31f\n" + "30:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v11.s }[0], [x9], #0x4\n" + "31:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v30.4s, v5.4s, v11.4s\n" + "ldr x28, [x16, #0x90]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "add x28, x28, x14\n" + "fmla v27.4s, v2.4s, v11.4s\n" + "fmla v26.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v13.d }[0], [x28], #0x8\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v13.s }[2], [x28], #0x4\n" + "b 33f\n" + "32:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v13.s }[0], [x28], #0x4\n" + "33:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v24.4s, v8.4s, v13.4s\n" + "ldr x27, [x16, #0x98]\n" + "fmla v23.4s, v7.4s, v13.4s\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v12.d }[0], [x27], #0x8\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v12.s }[2], [x27], #0x4\n" + "b 35f\n" + "34:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v12.s }[0], [x27], #0x4\n" + "35:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v28.4s, v7.4s, v12.4s\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v27.4s, v6.4s, v12.4s\n" + "add x26, x26, x14\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "fmla v24.4s, v3.4s, v12.4s\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v11.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v11.s }[2], [x26], #0x4\n" + "b 37f\n" + "36:" // Oddments: Load input (0, 2): Bit 1: Unset + "ld1 { v11.s }[0], [x26], #0x4\n" + "37:" // Oddments: Load input (0, 2): Bit 1: End + "fmla v31.4s, v2.4s, v11.4s\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v30.4s, v1.4s, v11.4s\n" + "add x25, x25, x14\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v13.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v13.s }[2], [x25], #0x4\n" + "b 39f\n" + "38:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v13.s }[0], [x25], #0x4\n" + "39:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v27.4s, v8.4s, v13.4s\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.4s, v7.4s, v13.4s\n" + "add x24, x24, x14\n" + "fmla v24.4s, v5.4s, v13.4s\n" + "fmla v23.4s, v4.4s, v13.4s\n" + "tbz %x[n_channels], #1, 40f\n" + "ld1 { v12.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 41f\n" + "ld1 { v12.s }[2], [x24], #0x4\n" + "b 41f\n" + "40:" // Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v12.s }[0], [x24], #0x4\n" + "41:" // Oddments: Load input (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v12.4s\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "add x23, x23, x14\n" + "fmla v25.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 42f\n" + "ld1 { v11.d }[0], [x23], #0x8\n" 
+ "tbz %x[n_channels], #0, 43f\n" + "ld1 { v11.s }[2], [x23], #0x4\n" + "b 43f\n" + "42:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x23], #0x4\n" + "43:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v29.4s, v8.4s, v11.4s\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v26.4s, v5.4s, v11.4s\n" + "add x10, x10, x14\n" + "fmla v23.4s, v2.4s, v11.4s\n" + "tbz %x[n_channels], #1, 44f\n" + "ld1 { v13.d }[0], [x10], #0x8\n" + "tbz %x[n_channels], #0, 45f\n" + "ld1 { v13.s }[2], [x10], #0x4\n" + "b 45f\n" + "44:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v13.s }[0], [x10], #0x4\n" + "45:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v25.4s, v8.4s, v13.4s\n" + "fmla v24.4s, v7.4s, v13.4s\n" + "fmla v23.4s, v6.4s, v13.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "fmax v27.4s, v27.4s, v18.4s\n" + "fmax v26.4s, v26.4s, v18.4s\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "fmin v27.4s, v27.4s, v17.4s\n" + "fmin v26.4s, v26.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v18.4s\n" + "fmax v24.4s, v24.4s, v18.4s\n" + "fmax v23.4s, v23.4s, v18.4s\n" + "fmin v25.4s, v25.4s, v17.4s\n" + "fmin v24.4s, v24.4s, v17.4s\n" + "fmin v23.4s, v23.4s, v17.4s\n" + "tbz %x[n_channels], #1, 46f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.d }[0], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.d }[0], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.d }[0], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.d }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.d }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.d }[0], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.d }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.d }[0], [x19]\n" + "add x12, x12, #0x8\n" + "st1 { v23.d }[0], [x22]\n" + "tbz %x[n_channels], #0, 47f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.s }[2], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.s }[2], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.s }[2], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.s }[2], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.s }[2], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.s }[2], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.s }[2], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.s }[2], [x19]\n" + "st1 { v23.s }[2], [x22]\n" + "b 47f\n" + "46:" // Oddments: Store: Bit 1: Unset + "ldr x22, [x17, #0x0]\n" + "add x22, x22, x12\n" + "ldr x21, [x17, #0x8]\n" + "ldr x20, [x17, #0x10]\n" + "add x21, x21, x12\n" + "st1 { v31.s }[0], [x22]\n" + "ldr x19, [x17, #0x18]\n" + "add x20, x20, x12\n" + "st1 { v30.s }[0], [x21]\n" + "add x19, x19, x12\n" + "st1 { v29.s }[0], [x20]\n" + "ldr x22, [x17, #0x20]\n" + "add x22, x22, x12\n" + "st1 { v28.s }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.s }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.s }[0], [x21]\n" + "ldr x19, [x17, 
#0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.s }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.s }[0], [x19]\n" + "st1 { v23.s }[0], [x22]\n" + "47:" // Oddments: Store: Bit 1: End + + "48:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp new file mode 100644 index 0000000000..84bac12429 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); +void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + +struct a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 4; + constexpr static unsigned int output_cols = 4; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl; + + a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..616fd0d0e7 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,1229 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "mov x4, #0x0\n" + "mov x26, #0x0\n" + "1:" // Tile loop + "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x25, #0x4\n" + "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x24, #0x4\n" + "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n" + "add x23, %x[params_struct], %[offsetof_args_min]\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "add x21, %x[params_struct], %[offsetof_args_max]\n" + "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "mov x7, #0x0\n" + "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x4, x22\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col + "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x25\n" // offset *= kernel_stride * output_size + "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1r { v15.4s }, [x23]\n" + "add x15, x8, x22, LSL #2\n" + "ld1r { v14.4s }, [x21]\n" + "add x14, x15, x22, LSL #2\n" + "lsl x6, x6, #0x2\n" + "add x13, x14, x22, LSL #2\n" + "add x12, x13, x22, LSL #2\n" + "add x11, x12, x22, LSL #2\n" + "add x10, x6, x6\n" + "add x9, x10, x6\n" + "add x28, x9, x6\n" + "add x27, x28, x6\n" + "mul x19, x4, x20\n" // offset = tile_i * ld_output_row + "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x24\n" // offset *= output_tile_size + "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "add x26, x16, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "lsl x17, x17, #0x2\n" + "add x23, x17, x17\n" + "add x22, x23, x17\n" + "mov x21, #0x10\n" // cntb _, ALL, #1 + "sub x20, XZR, x21\n" + "lsr x19, %x[n_channels], #0x2\n" + "cbz x19, 4f\n" + "ldr q13, [x5, #0x0]\n" + "ldr q0, [x5, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x5, 
#0x20]\n" + "ldr q2, [x5, #0x30]\n" + "ldr q3, [x5, #0x40]\n" + "ldr q4, [x5, #0x50]\n" + "ldr q5, [x5, #0x60]\n" + "ldr q6, [x5, #0x70]\n" + "ldr q7, [x5, #0x80]\n" + "ldr q8, [x5, #0x90]\n" + "add x5, x5, #0xa0\n" + "ldr q9, [x14, x10]\n" + "ld1 { v10.4s }, [x8]\n" + "ldr q11, [x8, x27]\n" + "ldr q12, [x14, x9]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "add x7, x7, #0x10\n" + "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "add x21, x21, #0x10\n" + "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n" + "cmp x21, x19, LSL #4\n" + "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n" + "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n" + "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n" + "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n" + "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n" + "ldr q9, [x13, x10]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ld1 { v10.4s }, [x11]\n" + "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n" + "ldr q11, [x11, x27]\n" + "fmla v30.4s, v8.4s, v12.4s\n" + "fmla v29.4s, v7.4s, v12.4s\n" + "fmla v26.4s, v5.4s, v12.4s\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n" + "fmla v22.4s, v2.4s, v12.4s\n" + "fmla v21.4s, v1.4s, v12.4s\n" + "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n" + "ldr q12, [x8, x6]\n" + "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n" + "ldr q10, [x13, x9]\n" + "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n" + "ldr q11, [x8, x28]\n" + "fmla v27.4s, v8.4s, v9.4s\n" + "fmla v26.4s, v7.4s, v9.4s\n" + "fmla v25.4s, v6.4s, v9.4s\n" + "fmla v23.4s, v5.4s, v9.4s\n" + "fmla v22.4s, v4.4s, v9.4s\n" + "fmla v21.4s, v3.4s, v9.4s\n" + "fmla v19.4s, v2.4s, v9.4s\n" + "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n" + "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n" + "ld1 { v9.4s }, [x15]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "ldr q13, [x5, #0x0]\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "ldr q12, [x15, x27]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "fmla v28.4s, v1.4s, v11.4s\n" + "ld1 { v11.4s }, [x12]\n" + "fmla v26.4s, v8.4s, v10.4s\n" + "fmla v25.4s, v7.4s, v10.4s\n" + "fmla v24.4s, v6.4s, v10.4s\n" + "fmla v22.4s, v5.4s, v10.4s\n" + "fmla v21.4s, v4.4s, v10.4s\n" + "fmla v20.4s, v3.4s, v10.4s\n" + "fmla v18.4s, v2.4s, v10.4s\n" + "fmla v17.4s, v1.4s, v10.4s\n" + "fmla v16.4s, v0.4s, v10.4s\n" + "ldr q10, [x15, x10]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "fmla v27.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v2.4s, v12.4s\n" + "ldr q12, [x15, x9]\n" + "fmla v23.4s, v6.4s, v11.4s\n" + "fmla v19.4s, v3.4s, v11.4s\n" + "ldr q11, [x12, x27]\n" + "fmla v31.4s, v5.4s, v10.4s\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v2.4s, v10.4s\n" + "fmla v26.4s, v1.4s, v10.4s\n" + "fmla v25.4s, v0.4s, v10.4s\n" + "ldr q10, [x14, x6]\n" + "fmla v20.4s, v8.4s, v11.4s\n" + "fmla v16.4s, v5.4s, v11.4s\n" + "ldr q11, [x11, x6]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v26.4s, v2.4s, v12.4s\n" + "fmla v25.4s, v1.4s, v12.4s\n" + "fmla v24.4s, v0.4s, v12.4s\n" + "ldr q12, [x14, x28]\n" + "fmla v19.4s, v7.4s, v11.4s\n" + "fmla v18.4s, v6.4s, v11.4s\n" + "ldr q11, [x11, x28]\n" + "fmla v31.4s, v7.4s, v10.4s\n" + "fmla v30.4s, v6.4s, v10.4s\n" + "fmla v27.4s, v4.4s, v10.4s\n" + "fmla v26.4s, v3.4s, 
v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v22.4s, v0.4s, v10.4s\n" + "ldr q10, [x8, x10]\n" + "fmla v17.4s, v8.4s, v11.4s\n" + "fmla v16.4s, v7.4s, v11.4s\n" + "ldr q11, [x13, x6]\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v25.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v4.4s, v12.4s\n" + "fmla v21.4s, v2.4s, v12.4s\n" + "fmla v20.4s, v1.4s, v12.4s\n" + "ldr q12, [x8, x9]\n" + "add x8, x8, #0x10\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v0.4s, v10.4s\n" + "ld1 { v10.4s }, [x14]\n" + "fmla v27.4s, v7.4s, v11.4s\n" + "fmla v26.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v4.4s, v11.4s\n" + "fmla v22.4s, v3.4s, v11.4s\n" + "fmla v19.4s, v1.4s, v11.4s\n" + "fmla v18.4s, v0.4s, v11.4s\n" + "ldr q11, [x13, x28]\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "ldr q12, [x14, x27]\n" + "add x14, x14, #0x10\n" + "fmla v31.4s, v6.4s, v10.4s\n" + "ldr q9, [x14, x10]\n" + "fmla v27.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v0.4s, v10.4s\n" + "ld1 { v10.4s }, [x13]\n" + "fmla v25.4s, v8.4s, v11.4s\n" + "fmla v24.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v5.4s, v11.4s\n" + "fmla v20.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v2.4s, v11.4s\n" + "fmla v16.4s, v1.4s, v11.4s\n" + "ldr q11, [x12, x10]\n" + "fmla v28.4s, v8.4s, v12.4s\n" + "fmla v24.4s, v5.4s, v12.4s\n" + "fmla v20.4s, v2.4s, v12.4s\n" + "ldr q12, [x13, x27]\n" + "add x13, x13, #0x10\n" + "fmla v27.4s, v6.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v19.4s, v0.4s, v10.4s\n" + "ldr q10, [x11, x10]\n" + "fmla v22.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v8.4s, v11.4s\n" + "fmla v19.4s, v5.4s, v11.4s\n" + "fmla v18.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v3.4s, v11.4s\n" + "ldr q11, [x12, x9]\n" + "fmla v24.4s, v8.4s, v12.4s\n" + "fmla v20.4s, v5.4s, v12.4s\n" + "fmla v16.4s, v2.4s, v12.4s\n" + "ldr q12, [x11, x9]\n" + "add x11, x11, #0x10\n" + "fmla v19.4s, v8.4s, v10.4s\n" + "fmla v18.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v6.4s, v10.4s\n" + "ldr q10, [x15, x6]\n" + "fmla v22.4s, v8.4s, v11.4s\n" + "fmla v21.4s, v7.4s, v11.4s\n" + "fmla v20.4s, v6.4s, v11.4s\n" + "fmla v18.4s, v5.4s, v11.4s\n" + "fmla v17.4s, v4.4s, v11.4s\n" + "fmla v16.4s, v3.4s, v11.4s\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v18.4s, v8.4s, v12.4s\n" + "fmla v31.4s, v4.4s, v10.4s\n" + "fmla v17.4s, v7.4s, v12.4s\n" + "fmla v16.4s, v6.4s, v12.4s\n" + "ldr q12, [x12, x6]\n" + "fmla v30.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v1.4s, v10.4s\n" + "fmla v26.4s, v0.4s, v10.4s\n" + "ldr q10, [x12, x28]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v5.4s, v11.4s\n" + "ldr q0, [x5, #0x10]\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v25.4s, v2.4s, v11.4s\n" + "ldr q2, [x5, #0x30]\n" + "fmla v24.4s, v1.4s, v11.4s\n" + "ldr q11, [x8, x27]\n" + "fmla v23.4s, v7.4s, v12.4s\n" + "ldr q1, [x5, #0x20]\n" + "fmla v22.4s, v6.4s, v12.4s\n" + "ldr q6, [x5, #0x70]\n" + "fmla v19.4s, v4.4s, v12.4s\n" + "fmla v18.4s, v3.4s, v12.4s\n" + "ldr q12, [x14, x9]\n" + "fmla v21.4s, v8.4s, v10.4s\n" + "ldr q3, [x5, #0x40]\n" + "fmla v20.4s, v7.4s, v10.4s\n" + "ldr q7, [x5, #0x80]\n" + "fmla v17.4s, v5.4s, v10.4s\n" + "ldr q5, [x5, #0x60]\n" + "fmla v16.4s, v4.4s, v10.4s\n" + "ld1 { v10.4s }, [x8]\n" + "fmax v31.4s, v31.4s, v15.4s\n" + "ldr q4, [x5, #0x50]\n" + "fmax v30.4s, v30.4s, v15.4s\n" + "ldr q8, [x5, #0x90]\n" + "add x5, x5, #0xa0\n" + "fmin v31.4s, v31.4s, v14.4s\n" + "st1 { v31.4s }, [x16]\n" + "fmin v30.4s, v30.4s, v14.4s\n" + "fmax 
v29.4s, v29.4s, v15.4s\n" + "str q30, [x16, x17]\n" + "fmin v29.4s, v29.4s, v14.4s\n" + "fmax v28.4s, v28.4s, v15.4s\n" + "str q29, [x16, x23]\n" + "fmin v28.4s, v28.4s, v14.4s\n" + "fmax v27.4s, v27.4s, v15.4s\n" + "str q28, [x16, x22]\n" + "fmin v27.4s, v27.4s, v14.4s\n" + "add x16, x16, #0x10\n" + "fmax v26.4s, v26.4s, v15.4s\n" + "st1 { v27.4s }, [x26]\n" + "fmax v25.4s, v25.4s, v15.4s\n" + "fmax v24.4s, v24.4s, v15.4s\n" + "fmin v26.4s, v26.4s, v14.4s\n" + "str q26, [x26, x17]\n" + "fmin v25.4s, v25.4s, v14.4s\n" + "fmin v24.4s, v24.4s, v14.4s\n" + "str q25, [x26, x23]\n" + "fmax v23.4s, v23.4s, v15.4s\n" + "fmax v22.4s, v22.4s, v15.4s\n" + "str q24, [x26, x22]\n" + "add x26, x26, #0x10\n" + "fmax v21.4s, v21.4s, v15.4s\n" + "fmax v20.4s, v20.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v14.4s\n" + "st1 { v23.4s }, [x25]\n" + "fmin v22.4s, v22.4s, v14.4s\n" + "fmin v21.4s, v21.4s, v14.4s\n" + "str q22, [x25, x17]\n" + "fmin v20.4s, v20.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v15.4s\n" + "str q21, [x25, x23]\n" + "fmax v18.4s, v18.4s, v15.4s\n" + "str q20, [x25, x22]\n" + "fmin v19.4s, v19.4s, v14.4s\n" + "add x25, x25, #0x10\n" + "fmin v18.4s, v18.4s, v14.4s\n" + "st1 { v19.4s }, [x24]\n" + "fmax v17.4s, v17.4s, v15.4s\n" + "fmax v16.4s, v16.4s, v15.4s\n" + "str q18, [x24, x17]\n" + "fmin v17.4s, v17.4s, v14.4s\n" + "str q17, [x24, x23]\n" + "fmin v16.4s, v16.4s, v14.4s\n" + "str q16, [x24, x22]\n" + "add x24, x24, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n" + "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n" + "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n" + "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n" + "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n" + "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n" + "ldr q9, [x13, x10]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ld1 { v10.4s }, [x11]\n" + "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n" + "ldr q11, [x11, x27]\n" + "fmla v30.4s, v8.4s, v12.4s\n" + "fmla v29.4s, v7.4s, v12.4s\n" + "fmla v26.4s, v5.4s, v12.4s\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n" + "fmla v22.4s, v2.4s, v12.4s\n" + "fmla v21.4s, v1.4s, v12.4s\n" + "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n" + "ldr q12, [x8, x6]\n" + "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n" + "ldr q10, [x13, x9]\n" + "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n" + "ldr q11, [x8, x28]\n" + "fmla v27.4s, v8.4s, v9.4s\n" + "fmla v26.4s, v7.4s, v9.4s\n" + "fmla v25.4s, v6.4s, v9.4s\n" + "fmla v23.4s, v5.4s, v9.4s\n" + "fmla v22.4s, v4.4s, v9.4s\n" + "fmla v21.4s, v3.4s, v9.4s\n" + "fmla v19.4s, v2.4s, v9.4s\n" + "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n" + "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n" + "ld1 { v9.4s }, [x15]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "ldr q12, [x15, x27]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "fmla v28.4s, v1.4s, v11.4s\n" + "ld1 { v11.4s }, [x12]\n" + "fmla v26.4s, v8.4s, v10.4s\n" + "fmla v25.4s, v7.4s, v10.4s\n" + "fmla v24.4s, v6.4s, v10.4s\n" + "fmla v22.4s, v5.4s, v10.4s\n" + "fmla v21.4s, v4.4s, v10.4s\n" + "fmla v20.4s, v3.4s, v10.4s\n" + "fmla v18.4s, v2.4s, v10.4s\n" + "fmla v17.4s, v1.4s, v10.4s\n" + "fmla v16.4s, v0.4s, v10.4s\n" + "ldr q10, [x15, x10]\n" + 
"fmla v31.4s, v3.4s, v9.4s\n" + "fmla v27.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v2.4s, v12.4s\n" + "ldr q12, [x15, x9]\n" + "fmla v23.4s, v6.4s, v11.4s\n" + "fmla v19.4s, v3.4s, v11.4s\n" + "ldr q11, [x12, x27]\n" + "fmla v31.4s, v5.4s, v10.4s\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v2.4s, v10.4s\n" + "fmla v26.4s, v1.4s, v10.4s\n" + "fmla v25.4s, v0.4s, v10.4s\n" + "ldr q10, [x14, x6]\n" + "fmla v20.4s, v8.4s, v11.4s\n" + "fmla v16.4s, v5.4s, v11.4s\n" + "ldr q11, [x11, x6]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v26.4s, v2.4s, v12.4s\n" + "fmla v25.4s, v1.4s, v12.4s\n" + "fmla v24.4s, v0.4s, v12.4s\n" + "ldr q12, [x14, x28]\n" + "fmla v19.4s, v7.4s, v11.4s\n" + "fmla v18.4s, v6.4s, v11.4s\n" + "ldr q11, [x11, x28]\n" + "fmla v31.4s, v7.4s, v10.4s\n" + "fmla v30.4s, v6.4s, v10.4s\n" + "fmla v27.4s, v4.4s, v10.4s\n" + "fmla v26.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v22.4s, v0.4s, v10.4s\n" + "ldr q10, [x8, x10]\n" + "fmla v17.4s, v8.4s, v11.4s\n" + "fmla v16.4s, v7.4s, v11.4s\n" + "ldr q11, [x13, x6]\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v25.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v4.4s, v12.4s\n" + "fmla v21.4s, v2.4s, v12.4s\n" + "fmla v20.4s, v1.4s, v12.4s\n" + "ldr q12, [x8, x9]\n" + "add x8, x8, #0x10\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v0.4s, v10.4s\n" + "ld1 { v10.4s }, [x14]\n" + "fmla v27.4s, v7.4s, v11.4s\n" + "fmla v26.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v4.4s, v11.4s\n" + "fmla v22.4s, v3.4s, v11.4s\n" + "fmla v19.4s, v1.4s, v11.4s\n" + "fmla v18.4s, v0.4s, v11.4s\n" + "ldr q11, [x13, x28]\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "ldr q12, [x14, x27]\n" + "add x14, x14, #0x10\n" + "fmla v31.4s, v6.4s, v10.4s\n" + "fmla v27.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v0.4s, v10.4s\n" + "ld1 { v10.4s }, [x13]\n" + "fmla v25.4s, v8.4s, v11.4s\n" + "fmla v24.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v5.4s, v11.4s\n" + "fmla v20.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v2.4s, v11.4s\n" + "fmla v16.4s, v1.4s, v11.4s\n" + "ldr q11, [x12, x10]\n" + "fmla v28.4s, v8.4s, v12.4s\n" + "fmla v24.4s, v5.4s, v12.4s\n" + "fmla v20.4s, v2.4s, v12.4s\n" + "ldr q12, [x13, x27]\n" + "add x13, x13, #0x10\n" + "fmla v27.4s, v6.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v19.4s, v0.4s, v10.4s\n" + "ldr q10, [x11, x10]\n" + "fmla v22.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v8.4s, v11.4s\n" + "fmla v19.4s, v5.4s, v11.4s\n" + "fmla v18.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v3.4s, v11.4s\n" + "ldr q11, [x12, x9]\n" + "fmla v24.4s, v8.4s, v12.4s\n" + "fmla v20.4s, v5.4s, v12.4s\n" + "fmla v16.4s, v2.4s, v12.4s\n" + "ldr q12, [x11, x9]\n" + "add x11, x11, #0x10\n" + "fmla v19.4s, v8.4s, v10.4s\n" + "fmla v18.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v6.4s, v10.4s\n" + "ldr q10, [x15, x6]\n" + "fmla v22.4s, v8.4s, v11.4s\n" + "fmla v21.4s, v7.4s, v11.4s\n" + "fmla v20.4s, v6.4s, v11.4s\n" + "fmla v18.4s, v5.4s, v11.4s\n" + "fmla v17.4s, v4.4s, v11.4s\n" + "fmla v16.4s, v3.4s, v11.4s\n" + "ldr q11, [x15, x28]\n" + "add x15, x15, #0x10\n" + "fmla v18.4s, v8.4s, v12.4s\n" + "fmla v31.4s, v4.4s, v10.4s\n" + "fmla v17.4s, v7.4s, v12.4s\n" + "fmla v16.4s, v6.4s, v12.4s\n" + "ldr q12, [x12, x6]\n" + "fmla v30.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v1.4s, v10.4s\n" + "fmla v26.4s, 
v0.4s, v10.4s\n" + "ldr q10, [x12, x28]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v5.4s, v11.4s\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v25.4s, v2.4s, v11.4s\n" + "fmla v24.4s, v1.4s, v11.4s\n" + "fmla v23.4s, v7.4s, v12.4s\n" + "fmla v22.4s, v6.4s, v12.4s\n" + "fmla v19.4s, v4.4s, v12.4s\n" + "fmla v18.4s, v3.4s, v12.4s\n" + "fmla v21.4s, v8.4s, v10.4s\n" + "fmla v20.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v5.4s, v10.4s\n" + "fmla v16.4s, v4.4s, v10.4s\n" + "fmax v31.4s, v31.4s, v15.4s\n" + "fmax v30.4s, v30.4s, v15.4s\n" + "fmax v29.4s, v29.4s, v15.4s\n" + "fmin v31.4s, v31.4s, v14.4s\n" + "st1 { v31.4s }, [x16]\n" + "fmin v30.4s, v30.4s, v14.4s\n" + "fmin v29.4s, v29.4s, v14.4s\n" + "str q30, [x16, x17]\n" + "fmax v28.4s, v28.4s, v15.4s\n" + "fmax v27.4s, v27.4s, v15.4s\n" + "str q29, [x16, x23]\n" + "fmax v26.4s, v26.4s, v15.4s\n" + "fmax v25.4s, v25.4s, v15.4s\n" + "fmin v28.4s, v28.4s, v14.4s\n" + "str q28, [x16, x22]\n" + "fmin v27.4s, v27.4s, v14.4s\n" + "add x16, x16, #0x10\n" + "fmin v26.4s, v26.4s, v14.4s\n" + "st1 { v27.4s }, [x26]\n" + "fmin v25.4s, v25.4s, v14.4s\n" + "fmax v24.4s, v24.4s, v15.4s\n" + "str q26, [x26, x17]\n" + "fmax v23.4s, v23.4s, v15.4s\n" + "str q25, [x26, x23]\n" + "fmin v24.4s, v24.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v15.4s\n" + "str q24, [x26, x22]\n" + "fmin v23.4s, v23.4s, v14.4s\n" + "add x26, x26, #0x10\n" + "fmin v22.4s, v22.4s, v14.4s\n" + "st1 { v23.4s }, [x25]\n" + "fmax v21.4s, v21.4s, v15.4s\n" + "fmax v20.4s, v20.4s, v15.4s\n" + "str q22, [x25, x17]\n" + "fmax v19.4s, v19.4s, v15.4s\n" + "fmax v18.4s, v18.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v14.4s\n" + "str q21, [x25, x23]\n" + "fmin v20.4s, v20.4s, v14.4s\n" + "fmin v19.4s, v19.4s, v14.4s\n" + "str q20, [x25, x22]\n" + "fmin v18.4s, v18.4s, v14.4s\n" + "add x25, x25, #0x10\n" + "fmax v17.4s, v17.4s, v15.4s\n" + "st1 { v19.4s }, [x24]\n" + "fmax v16.4s, v16.4s, v15.4s\n" + "str q18, [x24, x17]\n" + "fmin v17.4s, v17.4s, v14.4s\n" + "str q17, [x24, x23]\n" + "fmin v16.4s, v16.4s, v14.4s\n" + "str q16, [x24, x22]\n" + "add x24, x24, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x3\n" + "beq 73f\n" + "ldr q13, [x5, #0x0]\n" + "ldr q0, [x5, #0x10]\n" + "add x22, x14, x10\n" + "ldr q1, [x5, #0x20]\n" + "add x21, x8, XZR\n" + "ldr q2, [x5, #0x30]\n" + "add x20, x8, x27\n" + "ldr q3, [x5, #0x40]\n" + "add x19, x14, x9\n" + "ldr q4, [x5, #0x50]\n" + "ldr q5, [x5, #0x60]\n" + "ldr q6, [x5, #0x70]\n" + "ldr q7, [x5, #0x80]\n" + "ldr q8, [x5, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr d9, [x22], #0x8\n" + "ldr d10, [x21], #0x8\n" + "ldr d11, [x20], #0x8\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.s }[2], [x22]\n" + "ld1 { v10.s }[2], [x21]\n" + "ld1 { v11.s }[2], [x20]\n" + "ld1 { v12.s }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset + "ldr s9, [x22, #0x0]\n" + "ldr s10, [x21, #0x0]\n" + "ldr s11, [x20, #0x0]\n" + "ldr s12, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End + "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "add x19, x11, XZR\n" + "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n" + "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n" + "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n" + "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n" + "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, 
v9.4s\n" + "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n" + "fmla v30.4s, v8.4s, v12.4s\n" + "fmla v29.4s, v7.4s, v12.4s\n" + "fmla v26.4s, v5.4s, v12.4s\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n" + "fmla v22.4s, v2.4s, v12.4s\n" + "fmla v21.4s, v1.4s, v12.4s\n" + "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End + "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n" + "add x19, x11, x27\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End + "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n" + "add x19, x13, x10\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v27.4s, v8.4s, v9.4s\n" + "add x19, x8, x6\n" + "fmla v26.4s, v7.4s, v9.4s\n" + "fmla v25.4s, v6.4s, v9.4s\n" + "fmla v23.4s, v5.4s, v9.4s\n" + "fmla v22.4s, v4.4s, v9.4s\n" + "fmla v21.4s, v3.4s, v9.4s\n" + "fmla v19.4s, v2.4s, v9.4s\n" + "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n" + "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End + "fmla v31.4s, v1.4s, v12.4s\n" + "add x19, x8, x28\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End + "fmla v29.4s, v2.4s, v11.4s\n" + "add x19, x13, x9\n" + "fmla v28.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v26.4s, v8.4s, v10.4s\n" + "add x19, x15, XZR\n" + "fmla v25.4s, v7.4s, v10.4s\n" + "fmla v24.4s, v6.4s, v10.4s\n" + "fmla v22.4s, v5.4s, v10.4s\n" + "fmla v21.4s, v4.4s, v10.4s\n" + "fmla v20.4s, v3.4s, v10.4s\n" + "fmla v18.4s, v2.4s, v10.4s\n" + "fmla v17.4s, v1.4s, v10.4s\n" + "fmla v16.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: 
End + "fmla v31.4s, v3.4s, v9.4s\n" + "add x19, x15, x27\n" + "fmla v27.4s, v0.4s, v9.4s\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End + "fmla v28.4s, v5.4s, v12.4s\n" + "add x19, x12, XZR\n" + "fmla v24.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End + "fmla v23.4s, v6.4s, v11.4s\n" + "add x19, x15, x10\n" + "fmla v19.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End + "fmla v31.4s, v5.4s, v10.4s\n" + "add x19, x12, x27\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v2.4s, v10.4s\n" + "fmla v26.4s, v1.4s, v10.4s\n" + "fmla v25.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End + "fmla v20.4s, v8.4s, v11.4s\n" + "add x19, x15, x9\n" + "fmla v16.4s, v5.4s, v11.4s\n" + "tbz %x[n_channels], #1, 29f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.4s, v5.4s, v12.4s\n" + "add x19, x11, x6\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v26.4s, v2.4s, v12.4s\n" + "fmla v25.4s, v1.4s, v12.4s\n" + "fmla v24.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End + "fmla v19.4s, v7.4s, v11.4s\n" + "add x19, x14, x6\n" + "fmla v18.4s, v6.4s, v11.4s\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v31.4s, v7.4s, v10.4s\n" + "add x19, x11, x28\n" + "fmla v30.4s, v6.4s, v10.4s\n" + "fmla v27.4s, v4.4s, v10.4s\n" + "fmla v26.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v22.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 36f\n" + "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End + "fmla v17.4s, v8.4s, v11.4s\n" + "add x19, x14, x28\n" + "fmla v16.4s, v7.4s, 
v11.4s\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v29.4s, v8.4s, v12.4s\n" + "add x19, x8, x10\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v25.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v4.4s, v12.4s\n" + "fmla v21.4s, v2.4s, v12.4s\n" + "fmla v20.4s, v1.4s, v12.4s\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End + "fmla v31.4s, v2.4s, v10.4s\n" + "add x19, x13, x6\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 41f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 42f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 42f\n" + "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v27.4s, v7.4s, v11.4s\n" + "add x19, x8, x9\n" + "fmla v26.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v4.4s, v11.4s\n" + "fmla v22.4s, v3.4s, v11.4s\n" + "fmla v19.4s, v1.4s, v11.4s\n" + "fmla v18.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 43f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 44f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 44f\n" + "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End + "fmla v30.4s, v2.4s, v12.4s\n" + "add x19, x14, XZR\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 45f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 46f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 46f\n" + "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v10.4s\n" + "add x19, x13, x28\n" + "fmla v27.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 47f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 48f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 48f\n" + "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v25.4s, v8.4s, v11.4s\n" + "add x19, x14, x27\n" + "fmla v24.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v5.4s, v11.4s\n" + "fmla v20.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v2.4s, v11.4s\n" + "fmla v16.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 49f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 50f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 50f\n" + "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End + "fmla v28.4s, v8.4s, v12.4s\n" + "add x19, x13, XZR\n" + "fmla v24.4s, v5.4s, v12.4s\n" + "fmla v20.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 51f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 52f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 52f\n" + "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v27.4s, v6.4s, v10.4s\n" + 
"add x19, x12, x10\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v19.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 53f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 54f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 54f\n" + "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End + "fmla v23.4s, v8.4s, v11.4s\n" + "add x19, x13, x27\n" + "fmla v22.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v6.4s, v11.4s\n" + "fmla v19.4s, v5.4s, v11.4s\n" + "fmla v18.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 55f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 56f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 56f\n" + "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End + "fmla v24.4s, v8.4s, v12.4s\n" + "add x19, x11, x10\n" + "fmla v20.4s, v5.4s, v12.4s\n" + "fmla v16.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 57f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 58f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 58f\n" + "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End + "fmla v19.4s, v8.4s, v10.4s\n" + "add x19, x12, x9\n" + "fmla v18.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v6.4s, v10.4s\n" + "tbz %x[n_channels], #1, 59f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 60f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 60f\n" + "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v22.4s, v8.4s, v11.4s\n" + "add x19, x11, x9\n" + "fmla v21.4s, v7.4s, v11.4s\n" + "fmla v20.4s, v6.4s, v11.4s\n" + "fmla v18.4s, v5.4s, v11.4s\n" + "fmla v17.4s, v4.4s, v11.4s\n" + "fmla v16.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 61f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 62f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 62f\n" + "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End + "fmla v18.4s, v8.4s, v12.4s\n" + "add x19, x15, x6\n" + "fmla v17.4s, v7.4s, v12.4s\n" + "fmla v16.4s, v6.4s, v12.4s\n" + "tbz %x[n_channels], #1, 63f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 64f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 64f\n" + "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End + "fmla v31.4s, v4.4s, v10.4s\n" + "add x19, x15, x28\n" + "fmla v30.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v1.4s, v10.4s\n" + "fmla v26.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 65f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 66f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 66f\n" + "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End + "fmla v29.4s, v5.4s, v11.4s\n" + "add x19, x12, x6\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v25.4s, v2.4s, v11.4s\n" + "fmla v24.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 67f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 68f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 68f\n" + "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 
1: End + "fmla v23.4s, v7.4s, v12.4s\n" + "add x19, x12, x28\n" + "fmla v22.4s, v6.4s, v12.4s\n" + "fmla v19.4s, v4.4s, v12.4s\n" + "fmla v18.4s, v3.4s, v12.4s\n" + "tbz %x[n_channels], #1, 69f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 70f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 70f\n" + "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v21.4s, v8.4s, v10.4s\n" + "fmla v20.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v5.4s, v10.4s\n" + "fmla v16.4s, v4.4s, v10.4s\n" + "fmax v31.4s, v31.4s, v15.4s\n" + "fmax v30.4s, v30.4s, v15.4s\n" + "fmax v29.4s, v29.4s, v15.4s\n" + "fmin v31.4s, v31.4s, v14.4s\n" + "fmin v30.4s, v30.4s, v14.4s\n" + "fmin v29.4s, v29.4s, v14.4s\n" + "fmax v28.4s, v28.4s, v15.4s\n" + "fmax v27.4s, v27.4s, v15.4s\n" + "fmax v26.4s, v26.4s, v15.4s\n" + "fmin v28.4s, v28.4s, v14.4s\n" + "fmin v27.4s, v27.4s, v14.4s\n" + "fmin v26.4s, v26.4s, v14.4s\n" + "fmax v25.4s, v25.4s, v15.4s\n" + "fmax v24.4s, v24.4s, v15.4s\n" + "fmax v23.4s, v23.4s, v15.4s\n" + "fmin v25.4s, v25.4s, v14.4s\n" + "fmin v24.4s, v24.4s, v14.4s\n" + "fmin v23.4s, v23.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v15.4s\n" + "fmax v21.4s, v21.4s, v15.4s\n" + "fmax v20.4s, v20.4s, v15.4s\n" + "fmin v22.4s, v22.4s, v14.4s\n" + "fmin v21.4s, v21.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v15.4s\n" + "fmax v18.4s, v18.4s, v15.4s\n" + "fmax v17.4s, v17.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v14.4s\n" + "fmin v18.4s, v18.4s, v14.4s\n" + "fmin v17.4s, v17.4s, v14.4s\n" + "fmax v16.4s, v16.4s, v15.4s\n" + "fmin v16.4s, v16.4s, v14.4s\n" + "tbz %x[n_channels], #1, 71f\n" + "mov x19, x16\n" + "st1 { v31.d }[0], [x19], x17\n" + "add x16, x16, #0x8\n" + "st1 { v30.d }[0], [x19], x17\n" + "mov x21, x26\n" + "st1 { v29.d }[0], [x19], x17\n" + "st1 { v27.d }[0], [x21], x17\n" + "add x26, x26, #0x8\n" + "st1 { v28.d }[0], [x19]\n" + "mov x20, x25\n" + "st1 { v26.d }[0], [x21], x17\n" + "add x25, x25, #0x8\n" + "st1 { v25.d }[0], [x21], x17\n" + "mov x19, x24\n" + "st1 { v24.d }[0], [x21]\n" + "add x24, x24, #0x8\n" + "st1 { v23.d }[0], [x20], x17\n" + "st1 { v22.d }[0], [x20], x17\n" + "st1 { v21.d }[0], [x20], x17\n" + "st1 { v20.d }[0], [x20]\n" + "st1 { v19.d }[0], [x19], x17\n" + "st1 { v18.d }[0], [x19], x17\n" + "st1 { v17.d }[0], [x19], x17\n" + "st1 { v16.d }[0], [x19]\n" + "tbz %x[n_channels], #0, 72f\n" + "mov x22, x16\n" + "st1 { v31.s }[2], [x22], x17\n" + "mov x21, x26\n" + "st1 { v30.s }[2], [x22], x17\n" + "st1 { v27.s }[2], [x21], x17\n" + "mov x20, x25\n" + "st1 { v29.s }[2], [x22], x17\n" + "mov x19, x24\n" + "st1 { v28.s }[2], [x22]\n" + "st1 { v26.s }[2], [x21], x17\n" + "st1 { v25.s }[2], [x21], x17\n" + "st1 { v24.s }[2], [x21]\n" + "st1 { v23.s }[2], [x20], x17\n" + "st1 { v22.s }[2], [x20], x17\n" + "st1 { v21.s }[2], [x20], x17\n" + "st1 { v20.s }[2], [x20]\n" + "st1 { v19.s }[2], [x19], x17\n" + "st1 { v18.s }[2], [x19], x17\n" + "st1 { v17.s }[2], [x19], x17\n" + "st1 { v16.s }[2], [x19]\n" + "b 72f\n" + "71:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x22, x16\n" + "st1 { v31.s }[0], [x22], x17\n" + "mov x21, x26\n" + "mov x20, x25\n" + "st1 { v30.s }[0], [x22], x17\n" + "st1 { v27.s }[0], [x21], x17\n" + "mov x19, x24\n" + "st1 { v29.s }[0], [x22], x17\n" + "st1 { v28.s }[0], [x22]\n" + "st1 { v26.s }[0], [x21], x17\n" + "st1 { v25.s }[0], [x21], x17\n" + "st1 { v24.s }[0], [x21]\n" + "st1 { v23.s }[0], [x20], x17\n" + "st1 { v22.s }[0], [x20], 
x17\n" + "st1 { v21.s }[0], [x20], x17\n" + "st1 { v20.s }[0], [x20]\n" + "st1 { v19.s }[0], [x19], x17\n" + "st1 { v18.s }[0], [x19], x17\n" + "st1 { v17.s }[0], [x19], x17\n" + "st1 { v16.s }[0], [x19]\n" + "72:" // Tile loop: Oddments: Store: Bit 1: End + + "73:" // Tile loop: End + "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x4, #0x1\n" + "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x26, x26, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x26, x19\n" + "csel x26, x26, XZR, LT\n" + "csel x4, x4, x21, LT\n" + "cmp x4, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..51a5679bff --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,1395 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..51a5679bff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[36];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[14];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[5];
+      inptrs[3] = input_ptrs[15];
+      inptrs[4] = input_ptrs[30];
+      inptrs[5] = input_ptrs[35];
+      inptrs[6] = input_ptrs[20];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[4];
+      inptrs[9] = input_ptrs[21];
+      inptrs[10] = input_ptrs[6];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[24];
+      inptrs[13] = input_ptrs[8];
+      inptrs[14] = input_ptrs[29];
+      inptrs[15] = input_ptrs[9];
+      inptrs[16] = input_ptrs[31];
+      inptrs[17] = input_ptrs[13];
+      inptrs[18] = input_ptrs[34];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[19];
+      inptrs[22] = input_ptrs[3];
+      inptrs[23] = input_ptrs[12];
+      inptrs[24] = input_ptrs[22];
+      inptrs[25] = input_ptrs[17];
+      inptrs[26] = input_ptrs[18];
+      inptrs[27] = input_ptrs[26];
+      inptrs[28] = input_ptrs[23];
+      inptrs[29] = input_ptrs[32];
+      inptrs[30] = input_ptrs[27];
+      inptrs[31] = input_ptrs[33];
+      inptrs[32] = input_ptrs[7];
+      inptrs[33] = input_ptrs[10];
+      inptrs[34] = input_ptrs[25];
+      inptrs[35] = input_ptrs[28];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "add x19, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v15.4s }, [x20]\n"
+    "ld1r { v14.4s }, [x19]\n"
+    "mov x14, #0x0\n"
+    "mov x13, #0x10\n" // cntb _, ALL, #1
+    "sub x12, XZR, x13\n"
+    "lsr x11, %x[n_channels], #0x2\n"
+    "cbz x11, 3f\n"
+    "ldr q13, [x15, #0x0]\n"
+    "ldr q0, [x15, #0x10]\n"
+    "cmp x13, x11, LSL #4\n"
+    "ldr q1, [x15, #0x20]\n"
+    "ldr q2, [x15, #0x30]\n"
+    "ldr q3, [x15, #0x40]\n"
+    "ldr q4, [x15, #0x50]\n"
+    "ldr q5, [x15, #0x60]\n"
+    "ldr q6, [x15, #0x70]\n"
+    "ldr q7, [x15, #0x80]\n"
+    "ldr q8, [x15, #0x90]\n"
+    "add x15, x15, #0xa0\n"
+    "ldp x10, x9, [x16, #0x0]\n"
+    "ldp x28, x27, [x16, #0x10]\n"
+    "ldr q9, [x10, x14]\n"
+    "ldr q10, [x9, x14]\n"
+    "ldr q11, [x28, x14]\n"
+    "ldr q12, [x27, x14]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n"
+    "ldr x26, [x16, #0x20]\n"
+    "add x12, x12, #0x10\n"
+    "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n"
+    "ldr x25, [x16, #0x28]\n"
+    "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+    "ldr x24, [x16, #0x30]\n"
+    "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+    "ldr x23, [x16, #0x38]\n"
+    "mov v26.16b, v13.16b\n fmla v26.4s,
v4.4s, v9.4s\n" + "ldr x10, [x16, #0x40]\n" + "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n" + "ldr x9, [x16, #0x48]\n" + "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n" + "ldr x28, [x16, #0x50]\n" + "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n" + "ldr x27, [x16, #0x58]\n" + "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n" + "ldr q9, [x24, x14]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x26, x14]\n" + "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n" + "ldr q11, [x25, x14]\n" + "fmla v30.4s, v8.4s, v12.4s\n" + "ldr x26, [x16, #0x60]\n" + "fmla v29.4s, v7.4s, v12.4s\n" + "ldr x25, [x16, #0x68]\n" + "fmla v26.4s, v5.4s, v12.4s\n" + "ldr x24, [x16, #0x70]\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "ldr x22, [x17, #0x0]\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "ldr x21, [x17, #0x8]\n" + "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n" + "ldr x20, [x17, #0x10]\n" + "fmla v22.4s, v2.4s, v12.4s\n" + "ldr x19, [x17, #0x18]\n" + "fmla v21.4s, v1.4s, v12.4s\n" + "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n" + "ldr q12, [x23, x14]\n" + "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n" + "ldr q10, [x9, x14]\n" + "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n" + "ldr q11, [x10, x14]\n" + "fmla v27.4s, v8.4s, v9.4s\n" + "ldr x23, [x16, #0x78]\n" + "fmla v26.4s, v7.4s, v9.4s\n" + "ldr x10, [x16, #0x80]\n" + "fmla v25.4s, v6.4s, v9.4s\n" + "ldr x9, [x16, #0x88]\n" + "fmla v23.4s, v5.4s, v9.4s\n" + "fmla v22.4s, v4.4s, v9.4s\n" + "fmla v21.4s, v3.4s, v9.4s\n" + "fmla v19.4s, v2.4s, v9.4s\n" + "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n" + "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n" + "ldr q9, [x28, x14]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "ldr q12, [x27, x14]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "ldr x27, [x16, #0x98]\n" + "fmla v28.4s, v1.4s, v11.4s\n" + "ldr q11, [x26, x14]\n" + "fmla v26.4s, v8.4s, v10.4s\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v25.4s, v7.4s, v10.4s\n" + "ldr q13, [x15, #0x0]\n" + "fmla v24.4s, v6.4s, v10.4s\n" + "fmla v22.4s, v5.4s, v10.4s\n" + "fmla v21.4s, v4.4s, v10.4s\n" + "fmla v20.4s, v3.4s, v10.4s\n" + "fmla v18.4s, v2.4s, v10.4s\n" + "fmla v17.4s, v1.4s, v10.4s\n" + "fmla v16.4s, v0.4s, v10.4s\n" + "ldr q10, [x25, x14]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v27.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v2.4s, v12.4s\n" + "ldr q12, [x23, x14]\n" + "fmla v23.4s, v6.4s, v11.4s\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v19.4s, v3.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v31.4s, v5.4s, v10.4s\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v2.4s, v10.4s\n" + "fmla v26.4s, v1.4s, v10.4s\n" + "fmla v25.4s, v0.4s, v10.4s\n" + "ldr q10, [x9, x14]\n" + "fmla v20.4s, v8.4s, v11.4s\n" + "ldr x9, [x16, #0xc8]\n" + "fmla v16.4s, v5.4s, v11.4s\n" + "ldr q11, [x10, x14]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v26.4s, v2.4s, v12.4s\n" + "fmla v25.4s, v1.4s, v12.4s\n" + "fmla v24.4s, v0.4s, v12.4s\n" + "ldr q12, [x27, x14]\n" + "fmla v19.4s, v7.4s, v11.4s\n" + "ldr x27, [x16, #0xd8]\n" + "fmla v18.4s, v6.4s, v11.4s\n" + "ldr q11, [x28, x14]\n" + "fmla v31.4s, v7.4s, v10.4s\n" + "ldr x28, [x16, #0xd0]\n" + "fmla v30.4s, v6.4s, v10.4s\n" + "fmla v27.4s, v4.4s, v10.4s\n" + "fmla v26.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v22.4s, 
v0.4s, v10.4s\n" + "ldr q10, [x26, x14]\n" + "fmla v17.4s, v8.4s, v11.4s\n" + "ldr x26, [x16, #0xe0]\n" + "fmla v16.4s, v7.4s, v11.4s\n" + "ldr q11, [x25, x14]\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "ldr x25, [x16, #0xe8]\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v25.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v4.4s, v12.4s\n" + "fmla v21.4s, v2.4s, v12.4s\n" + "fmla v20.4s, v1.4s, v12.4s\n" + "ldr q12, [x24, x14]\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr x24, [x16, #0xf0]\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v0.4s, v10.4s\n" + "ldr q10, [x23, x14]\n" + "fmla v27.4s, v7.4s, v11.4s\n" + "ldr x23, [x16, #0xf8]\n" + "fmla v26.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v4.4s, v11.4s\n" + "fmla v22.4s, v3.4s, v11.4s\n" + "fmla v19.4s, v1.4s, v11.4s\n" + "fmla v18.4s, v0.4s, v11.4s\n" + "ldr q11, [x10, x14]\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "ldr x10, [x16, #0x100]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "ldr q12, [x9, x14]\n" + "fmla v31.4s, v6.4s, v10.4s\n" + "ldr x9, [x16, #0x108]\n" + "fmla v27.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v0.4s, v10.4s\n" + "ldr q10, [x28, x14]\n" + "fmla v25.4s, v8.4s, v11.4s\n" + "ldr x28, [x16, #0x110]\n" + "fmla v24.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v5.4s, v11.4s\n" + "fmla v20.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v2.4s, v11.4s\n" + "fmla v16.4s, v1.4s, v11.4s\n" + "ldr q11, [x27, x14]\n" + "fmla v28.4s, v8.4s, v12.4s\n" + "ldr x27, [x16, #0x118]\n" + "fmla v24.4s, v5.4s, v12.4s\n" + "fmla v20.4s, v2.4s, v12.4s\n" + "ldr q12, [x26, x14]\n" + "fmla v27.4s, v6.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v19.4s, v0.4s, v10.4s\n" + "ldr q10, [x25, x14]\n" + "fmla v22.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v8.4s, v11.4s\n" + "fmla v19.4s, v5.4s, v11.4s\n" + "fmla v18.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v3.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v24.4s, v8.4s, v12.4s\n" + "fmla v20.4s, v5.4s, v12.4s\n" + "fmla v16.4s, v2.4s, v12.4s\n" + "ldr q12, [x23, x14]\n" + "fmla v19.4s, v8.4s, v10.4s\n" + "fmla v18.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v6.4s, v10.4s\n" + "ldr q10, [x10, x14]\n" + "fmla v22.4s, v8.4s, v11.4s\n" + "fmla v21.4s, v7.4s, v11.4s\n" + "fmla v20.4s, v6.4s, v11.4s\n" + "fmla v18.4s, v5.4s, v11.4s\n" + "fmla v17.4s, v4.4s, v11.4s\n" + "fmla v16.4s, v3.4s, v11.4s\n" + "ldr q11, [x9, x14]\n" + "fmla v31.4s, v4.4s, v10.4s\n" + "ldp x10, x9, [x16, #0x0]\n" + "fmla v18.4s, v8.4s, v12.4s\n" + "ldr q9, [x10, x13]\n" + "fmla v17.4s, v7.4s, v12.4s\n" + "fmla v16.4s, v6.4s, v12.4s\n" + "ldr q12, [x28, x14]\n" + "fmla v30.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v1.4s, v10.4s\n" + "fmla v26.4s, v0.4s, v10.4s\n" + "ldr q10, [x27, x14]\n" + "add x14, x14, #0x10\n" + "fmla v29.4s, v5.4s, v11.4s\n" + "ldp x28, x27, [x16, #0x10]\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "ldr q0, [x15, #0x10]\n" + "fmla v25.4s, v2.4s, v11.4s\n" + "ldr q2, [x15, #0x30]\n" + "fmla v24.4s, v1.4s, v11.4s\n" + "ldr q11, [x28, x13]\n" + "fmla v23.4s, v7.4s, v12.4s\n" + "ldr q1, [x15, #0x20]\n" + "fmla v22.4s, v6.4s, v12.4s\n" + "ldr q6, [x15, #0x70]\n" + "fmla v19.4s, v4.4s, v12.4s\n" + "fmla v18.4s, v3.4s, v12.4s\n" + "ldr q12, [x27, x13]\n" + "fmla v21.4s, v8.4s, v10.4s\n" + "ldr q3, [x15, #0x40]\n" + "fmla v20.4s, v7.4s, v10.4s\n" + "ldr q7, [x15, #0x80]\n" + "fmla v17.4s, v5.4s, v10.4s\n" + "ldr q5, [x15, #0x60]\n" + "fmla v16.4s, v4.4s, v10.4s\n" + "ldr q10, [x9, x13]\n" + "add x13, x13, #0x10\n" + "fmax v31.4s, v31.4s, v15.4s\n" + "ldr q4, [x15, #0x50]\n" + "cmp x13, x11, LSL #4\n" + "fmax v30.4s, v30.4s, 
v15.4s\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "fmax v29.4s, v29.4s, v15.4s\n" + "fmax v28.4s, v28.4s, v15.4s\n" + "fmin v31.4s, v31.4s, v14.4s\n" + "str q31, [x22, x12]\n" + "fmin v30.4s, v30.4s, v14.4s\n" + "fmin v29.4s, v29.4s, v14.4s\n" + "ldr x22, [x17, #0x20]\n" + "fmin v28.4s, v28.4s, v14.4s\n" + "str q30, [x21, x12]\n" + "fmax v27.4s, v27.4s, v15.4s\n" + "fmax v26.4s, v26.4s, v15.4s\n" + "str q29, [x20, x12]\n" + "fmax v25.4s, v25.4s, v15.4s\n" + "str q28, [x19, x12]\n" + "fmax v24.4s, v24.4s, v15.4s\n" + "ldr x21, [x17, #0x28]\n" + "fmin v27.4s, v27.4s, v14.4s\n" + "ldr x20, [x17, #0x30]\n" + "fmin v26.4s, v26.4s, v14.4s\n" + "ldr x19, [x17, #0x38]\n" + "fmin v25.4s, v25.4s, v14.4s\n" + "str q27, [x22, x12]\n" + "fmin v24.4s, v24.4s, v14.4s\n" + "str q26, [x21, x12]\n" + "fmax v23.4s, v23.4s, v15.4s\n" + "str q25, [x20, x12]\n" + "fmax v22.4s, v22.4s, v15.4s\n" + "str q24, [x19, x12]\n" + "fmax v21.4s, v21.4s, v15.4s\n" + "ldr x22, [x17, #0x40]\n" + "fmin v23.4s, v23.4s, v14.4s\n" + "ldr x21, [x17, #0x48]\n" + "fmin v22.4s, v22.4s, v14.4s\n" + "ldr x20, [x17, #0x50]\n" + "fmin v21.4s, v21.4s, v14.4s\n" + "str q23, [x22, x12]\n" + "fmax v20.4s, v20.4s, v15.4s\n" + "str q22, [x21, x12]\n" + "fmax v19.4s, v19.4s, v15.4s\n" + "str q21, [x20, x12]\n" + "fmax v18.4s, v18.4s, v15.4s\n" + "ldr x19, [x17, #0x58]\n" + "fmin v20.4s, v20.4s, v14.4s\n" + "ldr x22, [x17, #0x60]\n" + "fmin v19.4s, v19.4s, v14.4s\n" + "ldr x21, [x17, #0x68]\n" + "fmin v18.4s, v18.4s, v14.4s\n" + "str q20, [x19, x12]\n" + "fmax v17.4s, v17.4s, v15.4s\n" + "str q19, [x22, x12]\n" + "fmax v16.4s, v16.4s, v15.4s\n" + "str q18, [x21, x12]\n" + "ldr x20, [x17, #0x70]\n" + "fmin v17.4s, v17.4s, v14.4s\n" + "ldr x19, [x17, #0x78]\n" + "fmin v16.4s, v16.4s, v14.4s\n" + "str q17, [x20, x12]\n" + "str q16, [x19, x12]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x26, [x16, #0x20]\n" + "add x12, x12, #0x10\n" + "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "ldr x25, [x16, #0x28]\n" + "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "ldr x24, [x16, #0x30]\n" + "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n" + "ldr x23, [x16, #0x38]\n" + "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n" + "ldr x10, [x16, #0x40]\n" + "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n" + "ldr x9, [x16, #0x48]\n" + "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n" + "ldr x28, [x16, #0x50]\n" + "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n" + "ldr x27, [x16, #0x58]\n" + "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n" + "ldr q9, [x24, x14]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr q10, [x26, x14]\n" + "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n" + "ldr q11, [x25, x14]\n" + "fmla v30.4s, v8.4s, v12.4s\n" + "ldr x26, [x16, #0x60]\n" + "fmla v29.4s, v7.4s, v12.4s\n" + "ldr x25, [x16, #0x68]\n" + "fmla v26.4s, v5.4s, v12.4s\n" + "ldr x24, [x16, #0x70]\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "ldr x22, [x17, #0x0]\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "ldr x21, [x17, #0x8]\n" + "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n" + "ldr x20, [x17, #0x10]\n" + "fmla v22.4s, v2.4s, v12.4s\n" + "ldr x19, [x17, #0x18]\n" + "fmla v21.4s, v1.4s, v12.4s\n" + "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n" + "ldr q12, [x23, x14]\n" + "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n" + "ldr q10, [x9, x14]\n" + "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n" + "ldr q11, [x10, x14]\n" + "fmla v27.4s, v8.4s, v9.4s\n" + "ldr x23, 
[x16, #0x78]\n" + "fmla v26.4s, v7.4s, v9.4s\n" + "ldr x10, [x16, #0x80]\n" + "fmla v25.4s, v6.4s, v9.4s\n" + "ldr x9, [x16, #0x88]\n" + "fmla v23.4s, v5.4s, v9.4s\n" + "fmla v22.4s, v4.4s, v9.4s\n" + "fmla v21.4s, v3.4s, v9.4s\n" + "fmla v19.4s, v2.4s, v9.4s\n" + "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n" + "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n" + "ldr q9, [x28, x14]\n" + "fmla v31.4s, v1.4s, v12.4s\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "ldr q12, [x27, x14]\n" + "fmla v29.4s, v2.4s, v11.4s\n" + "ldr x27, [x16, #0x98]\n" + "fmla v28.4s, v1.4s, v11.4s\n" + "ldr q11, [x26, x14]\n" + "fmla v26.4s, v8.4s, v10.4s\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v25.4s, v7.4s, v10.4s\n" + "fmla v24.4s, v6.4s, v10.4s\n" + "fmla v22.4s, v5.4s, v10.4s\n" + "fmla v21.4s, v4.4s, v10.4s\n" + "fmla v20.4s, v3.4s, v10.4s\n" + "fmla v18.4s, v2.4s, v10.4s\n" + "fmla v17.4s, v1.4s, v10.4s\n" + "fmla v16.4s, v0.4s, v10.4s\n" + "ldr q10, [x25, x14]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v27.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v2.4s, v12.4s\n" + "ldr q12, [x23, x14]\n" + "fmla v23.4s, v6.4s, v11.4s\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v19.4s, v3.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v31.4s, v5.4s, v10.4s\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v2.4s, v10.4s\n" + "fmla v26.4s, v1.4s, v10.4s\n" + "fmla v25.4s, v0.4s, v10.4s\n" + "ldr q10, [x9, x14]\n" + "fmla v20.4s, v8.4s, v11.4s\n" + "ldr x9, [x16, #0xc8]\n" + "fmla v16.4s, v5.4s, v11.4s\n" + "ldr q11, [x10, x14]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v26.4s, v2.4s, v12.4s\n" + "fmla v25.4s, v1.4s, v12.4s\n" + "fmla v24.4s, v0.4s, v12.4s\n" + "ldr q12, [x27, x14]\n" + "fmla v19.4s, v7.4s, v11.4s\n" + "ldr x27, [x16, #0xd8]\n" + "fmla v18.4s, v6.4s, v11.4s\n" + "ldr q11, [x28, x14]\n" + "fmla v31.4s, v7.4s, v10.4s\n" + "ldr x28, [x16, #0xd0]\n" + "fmla v30.4s, v6.4s, v10.4s\n" + "fmla v27.4s, v4.4s, v10.4s\n" + "fmla v26.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v22.4s, v0.4s, v10.4s\n" + "ldr q10, [x26, x14]\n" + "fmla v17.4s, v8.4s, v11.4s\n" + "ldr x26, [x16, #0xe0]\n" + "fmla v16.4s, v7.4s, v11.4s\n" + "ldr q11, [x25, x14]\n" + "fmla v29.4s, v8.4s, v12.4s\n" + "ldr x25, [x16, #0xe8]\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "fmla v25.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v4.4s, v12.4s\n" + "fmla v21.4s, v2.4s, v12.4s\n" + "fmla v20.4s, v1.4s, v12.4s\n" + "ldr q12, [x24, x14]\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr x24, [x16, #0xf0]\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v0.4s, v10.4s\n" + "ldr q10, [x23, x14]\n" + "fmla v27.4s, v7.4s, v11.4s\n" + "ldr x23, [x16, #0xf8]\n" + "fmla v26.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v4.4s, v11.4s\n" + "fmla v22.4s, v3.4s, v11.4s\n" + "fmla v19.4s, v1.4s, v11.4s\n" + "fmla v18.4s, v0.4s, v11.4s\n" + "ldr q11, [x10, x14]\n" + "fmla v30.4s, v2.4s, v12.4s\n" + "ldr x10, [x16, #0x100]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "ldr q12, [x9, x14]\n" + "fmla v31.4s, v6.4s, v10.4s\n" + "ldr x9, [x16, #0x108]\n" + "fmla v27.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v0.4s, v10.4s\n" + "ldr q10, [x28, x14]\n" + "fmla v25.4s, v8.4s, v11.4s\n" + "ldr x28, [x16, #0x110]\n" + "fmla v24.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v5.4s, v11.4s\n" + "fmla v20.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v2.4s, 
v11.4s\n" + "fmla v16.4s, v1.4s, v11.4s\n" + "ldr q11, [x27, x14]\n" + "fmla v28.4s, v8.4s, v12.4s\n" + "ldr x27, [x16, #0x118]\n" + "fmla v24.4s, v5.4s, v12.4s\n" + "fmla v20.4s, v2.4s, v12.4s\n" + "ldr q12, [x26, x14]\n" + "fmla v27.4s, v6.4s, v10.4s\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "fmla v19.4s, v0.4s, v10.4s\n" + "ldr q10, [x25, x14]\n" + "fmla v22.4s, v7.4s, v11.4s\n" + "fmla v21.4s, v6.4s, v11.4s\n" + "fmla v23.4s, v8.4s, v11.4s\n" + "fmla v19.4s, v5.4s, v11.4s\n" + "fmla v18.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v3.4s, v11.4s\n" + "ldr q11, [x24, x14]\n" + "fmla v24.4s, v8.4s, v12.4s\n" + "fmla v20.4s, v5.4s, v12.4s\n" + "fmla v16.4s, v2.4s, v12.4s\n" + "ldr q12, [x23, x14]\n" + "fmla v19.4s, v8.4s, v10.4s\n" + "fmla v18.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v6.4s, v10.4s\n" + "ldr q10, [x10, x14]\n" + "fmla v22.4s, v8.4s, v11.4s\n" + "fmla v21.4s, v7.4s, v11.4s\n" + "fmla v20.4s, v6.4s, v11.4s\n" + "fmla v18.4s, v5.4s, v11.4s\n" + "fmla v17.4s, v4.4s, v11.4s\n" + "fmla v16.4s, v3.4s, v11.4s\n" + "ldr q11, [x9, x14]\n" + "fmla v31.4s, v4.4s, v10.4s\n" + "fmla v18.4s, v8.4s, v12.4s\n" + "fmla v17.4s, v7.4s, v12.4s\n" + "fmla v16.4s, v6.4s, v12.4s\n" + "ldr q12, [x28, x14]\n" + "fmla v30.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v1.4s, v10.4s\n" + "fmla v26.4s, v0.4s, v10.4s\n" + "ldr q10, [x27, x14]\n" + "add x14, x14, #0x10\n" + "fmla v29.4s, v5.4s, v11.4s\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "fmla v25.4s, v2.4s, v11.4s\n" + "fmla v24.4s, v1.4s, v11.4s\n" + "fmla v23.4s, v7.4s, v12.4s\n" + "fmla v22.4s, v6.4s, v12.4s\n" + "fmla v19.4s, v4.4s, v12.4s\n" + "fmla v18.4s, v3.4s, v12.4s\n" + "fmla v21.4s, v8.4s, v10.4s\n" + "fmla v20.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v5.4s, v10.4s\n" + "fmla v16.4s, v4.4s, v10.4s\n" + "fmax v31.4s, v31.4s, v15.4s\n" + "fmax v30.4s, v30.4s, v15.4s\n" + "fmax v29.4s, v29.4s, v15.4s\n" + "fmin v31.4s, v31.4s, v14.4s\n" + "str q31, [x22, x12]\n" + "fmin v30.4s, v30.4s, v14.4s\n" + "fmin v29.4s, v29.4s, v14.4s\n" + "ldr x22, [x17, #0x20]\n" + "fmax v28.4s, v28.4s, v15.4s\n" + "str q30, [x21, x12]\n" + "fmax v27.4s, v27.4s, v15.4s\n" + "fmax v26.4s, v26.4s, v15.4s\n" + "str q29, [x20, x12]\n" + "fmin v28.4s, v28.4s, v14.4s\n" + "ldr x21, [x17, #0x28]\n" + "fmax v25.4s, v25.4s, v15.4s\n" + "ldr x20, [x17, #0x30]\n" + "fmin v27.4s, v27.4s, v14.4s\n" + "str q28, [x19, x12]\n" + "fmin v26.4s, v26.4s, v14.4s\n" + "ldr x19, [x17, #0x38]\n" + "fmin v25.4s, v25.4s, v14.4s\n" + "str q27, [x22, x12]\n" + "fmax v24.4s, v24.4s, v15.4s\n" + "str q26, [x21, x12]\n" + "fmax v23.4s, v23.4s, v15.4s\n" + "str q25, [x20, x12]\n" + "fmax v22.4s, v22.4s, v15.4s\n" + "ldr x22, [x17, #0x40]\n" + "fmin v24.4s, v24.4s, v14.4s\n" + "ldr x21, [x17, #0x48]\n" + "fmin v23.4s, v23.4s, v14.4s\n" + "ldr x20, [x17, #0x50]\n" + "fmin v22.4s, v22.4s, v14.4s\n" + "str q24, [x19, x12]\n" + "fmax v21.4s, v21.4s, v15.4s\n" + "str q23, [x22, x12]\n" + "fmax v20.4s, v20.4s, v15.4s\n" + "str q22, [x21, x12]\n" + "fmax v19.4s, v19.4s, v15.4s\n" + "ldr x19, [x17, #0x58]\n" + "fmin v21.4s, v21.4s, v14.4s\n" + "ldr x22, [x17, #0x60]\n" + "fmin v20.4s, v20.4s, v14.4s\n" + "ldr x21, [x17, #0x68]\n" + "fmin v19.4s, v19.4s, v14.4s\n" + "str q21, [x20, x12]\n" + "fmax v18.4s, v18.4s, v15.4s\n" + "str q20, [x19, x12]\n" + "fmax v17.4s, v17.4s, v15.4s\n" + "str q19, [x22, x12]\n" + "fmax v16.4s, v16.4s, v15.4s\n" + "ldr x20, [x17, #0x70]\n" + "fmin v18.4s, v18.4s, v14.4s\n" + "ldr x19, [x17, #0x78]\n" + "fmin v17.4s, v17.4s, v14.4s\n" + "str q18, [x21, x12]\n" + "fmin v16.4s, v16.4s, v14.4s\n" + "str q17, 
[x20, x12]\n" + "str q16, [x19, x12]\n" + "3:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 72f\n" + "ldr q13, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x12, x14\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x10, [x16, #0x0]\n" + "add x10, x10, x14\n" + "ldr x9, [x16, #0x8]\n" + "ldr x28, [x16, #0x10]\n" + "add x9, x9, x14\n" + "ldr x27, [x16, #0x18]\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.d }[0], [x10], #0x8\n" + "ld1 { v10.d }[0], [x9], #0x8\n" + "ld1 { v11.d }[0], [x28], #0x8\n" + "ld1 { v12.d }[0], [x27], #0x8\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.s }[2], [x10], #0x4\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "ld1 { v11.s }[2], [x28], #0x4\n" + "ld1 { v12.s }[2], [x27], #0x4\n" + "b 5f\n" + "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset + "ld1 { v9.s }[0], [x10], #0x4\n" + "ld1 { v10.s }[0], [x9], #0x4\n" + "ld1 { v11.s }[0], [x28], #0x4\n" + "ld1 { v12.s }[0], [x27], #0x4\n" + "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End + "mov v31.16b, v13.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x26, [x16, #0x20]\n" + "add x26, x26, x14\n" + "mov v30.16b, v13.16b\n fmla v30.4s, v7.4s, v9.4s\n" + "mov v29.16b, v13.16b\n fmla v29.4s, v6.4s, v9.4s\n" + "mov v27.16b, v13.16b\n fmla v27.4s, v5.4s, v9.4s\n" + "mov v26.16b, v13.16b\n fmla v26.4s, v4.4s, v9.4s\n" + "mov v25.16b, v13.16b\n fmla v25.4s, v3.4s, v9.4s\n" + "mov v23.16b, v13.16b\n fmla v23.4s, v2.4s, v9.4s\n" + "mov v22.16b, v13.16b\n fmla v22.4s, v1.4s, v9.4s\n" + "mov v21.16b, v13.16b\n fmla v21.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "mov v28.16b, v13.16b\n fmla v28.4s, v2.4s, v11.4s\n" + "fmla v30.4s, v8.4s, v12.4s\n" + "fmla v29.4s, v7.4s, v12.4s\n" + "fmla v26.4s, v5.4s, v12.4s\n" + "fmla v28.4s, v6.4s, v12.4s\n" + "fmla v25.4s, v4.4s, v12.4s\n" + "mov v24.16b, v13.16b\n fmla v24.4s, v3.4s, v12.4s\n" + "fmla v22.4s, v2.4s, v12.4s\n" + "fmla v21.4s, v1.4s, v12.4s\n" + "mov v20.16b, v13.16b\n fmla v20.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v10.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v10.s }[2], [x26], #0x4\n" + "b 7f\n" + "6:" // Oddments: Load input (5, 0): Bit 1: Unset + "ld1 { v10.s }[0], [x26], #0x4\n" + "7:" // Oddments: Load input (5, 0): Bit 1: End + "mov v19.16b, v13.16b\n fmla v19.4s, v6.4s, v10.4s\n" + "ldr x25, [x16, #0x28]\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v11.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v11.s }[2], [x25], #0x4\n" + "b 9f\n" + "8:" // Oddments: Load input (5, 5): Bit 1: Unset + "ld1 { v11.s }[0], [x25], #0x4\n" + "9:" // Oddments: Load input (5, 5): Bit 1: End + "mov v16.16b, v13.16b\n fmla v16.4s, v8.4s, v11.4s\n" + "ldr x24, [x16, #0x30]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v9.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v9.s }[2], [x24], #0x4\n" + "b 11f\n" + "10:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v9.s }[0], [x24], #0x4\n" + "11:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v27.4s, v8.4s, v9.4s\n" + "ldr x23, [x16, #0x38]\n" + "fmla v26.4s, v7.4s, v9.4s\n" + "add x23, x23, x14\n" + "fmla v25.4s, v6.4s, v9.4s\n" + "fmla v23.4s, v5.4s, v9.4s\n" + "fmla v22.4s, v4.4s, v9.4s\n" + "fmla v21.4s, v3.4s, v9.4s\n" + 
"fmla v19.4s, v2.4s, v9.4s\n" + "mov v18.16b, v13.16b\n fmla v18.4s, v1.4s, v9.4s\n" + "mov v17.16b, v13.16b\n fmla v17.4s, v0.4s, v9.4s\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v12.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v12.s }[2], [x23], #0x4\n" + "b 13f\n" + "12:" // Oddments: Load input (0, 1): Bit 1: Unset + "ld1 { v12.s }[0], [x23], #0x4\n" + "13:" // Oddments: Load input (0, 1): Bit 1: End + "fmla v31.4s, v1.4s, v12.4s\n" + "ldr x10, [x16, #0x40]\n" + "fmla v30.4s, v0.4s, v12.4s\n" + "add x10, x10, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v11.d }[0], [x10], #0x8\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v11.s }[2], [x10], #0x4\n" + "b 15f\n" + "14:" // Oddments: Load input (0, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x10], #0x4\n" + "15:" // Oddments: Load input (0, 4): Bit 1: End + "fmla v29.4s, v2.4s, v11.4s\n" + "ldr x9, [x16, #0x48]\n" + "fmla v28.4s, v1.4s, v11.4s\n" + "add x9, x9, x14\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v10.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "b 17f\n" + "16:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v10.s }[0], [x9], #0x4\n" + "17:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v26.4s, v8.4s, v10.4s\n" + "ldr x28, [x16, #0x50]\n" + "fmla v25.4s, v7.4s, v10.4s\n" + "add x28, x28, x14\n" + "fmla v24.4s, v6.4s, v10.4s\n" + "fmla v22.4s, v5.4s, v10.4s\n" + "fmla v21.4s, v4.4s, v10.4s\n" + "fmla v20.4s, v3.4s, v10.4s\n" + "fmla v18.4s, v2.4s, v10.4s\n" + "fmla v17.4s, v1.4s, v10.4s\n" + "fmla v16.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v9.d }[0], [x28], #0x8\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v9.s }[2], [x28], #0x4\n" + "b 19f\n" + "18:" // Oddments: Load input (1, 0): Bit 1: Unset + "ld1 { v9.s }[0], [x28], #0x4\n" + "19:" // Oddments: Load input (1, 0): Bit 1: End + "fmla v31.4s, v3.4s, v9.4s\n" + "ldr x27, [x16, #0x58]\n" + "fmla v27.4s, v0.4s, v9.4s\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v12.d }[0], [x27], #0x8\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v12.s }[2], [x27], #0x4\n" + "b 21f\n" + "20:" // Oddments: Load input (1, 5): Bit 1: Unset + "ld1 { v12.s }[0], [x27], #0x4\n" + "21:" // Oddments: Load input (1, 5): Bit 1: End + "fmla v28.4s, v5.4s, v12.4s\n" + "ldr x26, [x16, #0x60]\n" + "fmla v24.4s, v2.4s, v12.4s\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v11.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v11.s }[2], [x26], #0x4\n" + "b 23f\n" + "22:" // Oddments: Load input (4, 0): Bit 1: Unset + "ld1 { v11.s }[0], [x26], #0x4\n" + "23:" // Oddments: Load input (4, 0): Bit 1: End + "fmla v23.4s, v6.4s, v11.4s\n" + "ldr x25, [x16, #0x68]\n" + "fmla v19.4s, v3.4s, v11.4s\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v10.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v10.s }[2], [x25], #0x4\n" + "b 25f\n" + "24:" // Oddments: Load input (1, 2): Bit 1: Unset + "ld1 { v10.s }[0], [x25], #0x4\n" + "25:" // Oddments: Load input (1, 2): Bit 1: End + "fmla v31.4s, v5.4s, v10.4s\n" + "ldr x24, [x16, #0x70]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "add x24, x24, x14\n" + "fmla v29.4s, v3.4s, v10.4s\n" + "fmla v27.4s, v2.4s, v10.4s\n" + "fmla v26.4s, v1.4s, v10.4s\n" + "fmla v25.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v11.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v11.s }[2], [x24], #0x4\n" + "b 27f\n" + "26:" // Oddments: Load input (4, 5): 
Bit 1: Unset + "ld1 { v11.s }[0], [x24], #0x4\n" + "27:" // Oddments: Load input (4, 5): Bit 1: End + "fmla v20.4s, v8.4s, v11.4s\n" + "ldr x23, [x16, #0x78]\n" + "fmla v16.4s, v5.4s, v11.4s\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v12.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v12.s }[2], [x23], #0x4\n" + "b 29f\n" + "28:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v12.s }[0], [x23], #0x4\n" + "29:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr x10, [x16, #0x80]\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "add x10, x10, x14\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v26.4s, v2.4s, v12.4s\n" + "fmla v25.4s, v1.4s, v12.4s\n" + "fmla v24.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v11.d }[0], [x10], #0x8\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v11.s }[2], [x10], #0x4\n" + "b 31f\n" + "30:" // Oddments: Load input (5, 1): Bit 1: Unset + "ld1 { v11.s }[0], [x10], #0x4\n" + "31:" // Oddments: Load input (5, 1): Bit 1: End + "fmla v19.4s, v7.4s, v11.4s\n" + "ldr x9, [x16, #0x88]\n" + "fmla v18.4s, v6.4s, v11.4s\n" + "add x9, x9, x14\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v10.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "b 33f\n" + "32:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v10.s }[0], [x9], #0x4\n" + "33:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v31.4s, v7.4s, v10.4s\n" + "ldr x28, [x16, #0x90]\n" + "fmla v30.4s, v6.4s, v10.4s\n" + "add x28, x28, x14\n" + "fmla v27.4s, v4.4s, v10.4s\n" + "fmla v26.4s, v3.4s, v10.4s\n" + "fmla v23.4s, v1.4s, v10.4s\n" + "fmla v22.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v11.d }[0], [x28], #0x8\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v11.s }[2], [x28], #0x4\n" + "b 35f\n" + "34:" // Oddments: Load input (5, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x28], #0x4\n" + "35:" // Oddments: Load input (5, 4): Bit 1: End + "fmla v17.4s, v8.4s, v11.4s\n" + "ldr x27, [x16, #0x98]\n" + "fmla v16.4s, v7.4s, v11.4s\n" + "add x27, x27, x14\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v12.d }[0], [x27], #0x8\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v12.s }[2], [x27], #0x4\n" + "b 37f\n" + "36:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v12.s }[0], [x27], #0x4\n" + "37:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v29.4s, v8.4s, v12.4s\n" + "ldr x26, [x16, #0xa0]\n" + "fmla v28.4s, v7.4s, v12.4s\n" + "add x26, x26, x14\n" + "fmla v25.4s, v5.4s, v12.4s\n" + "fmla v24.4s, v4.4s, v12.4s\n" + "fmla v21.4s, v2.4s, v12.4s\n" + "fmla v20.4s, v1.4s, v12.4s\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v10.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v10.s }[2], [x26], #0x4\n" + "b 39f\n" + "38:" // Oddments: Load input (0, 2): Bit 1: Unset + "ld1 { v10.s }[0], [x26], #0x4\n" + "39:" // Oddments: Load input (0, 2): Bit 1: End + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr x25, [x16, #0xa8]\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "add x25, x25, x14\n" + "fmla v29.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 40f\n" + "ld1 { v11.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 41f\n" + "ld1 { v11.s }[2], [x25], #0x4\n" + "b 41f\n" + "40:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v11.s }[0], [x25], #0x4\n" + "41:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v27.4s, v7.4s, v11.4s\n" + "ldr x24, [x16, #0xb0]\n" + "fmla v26.4s, v6.4s, v11.4s\n" + "add x24, x24, x14\n" + "fmla v23.4s, v4.4s, v11.4s\n" + "fmla 
v22.4s, v3.4s, v11.4s\n" + "fmla v19.4s, v1.4s, v11.4s\n" + "fmla v18.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 42f\n" + "ld1 { v12.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 43f\n" + "ld1 { v12.s }[2], [x24], #0x4\n" + "b 43f\n" + "42:" // Oddments: Load input (0, 3): Bit 1: Unset + "ld1 { v12.s }[0], [x24], #0x4\n" + "43:" // Oddments: Load input (0, 3): Bit 1: End + "fmla v30.4s, v2.4s, v12.4s\n" + "ldr x23, [x16, #0xb8]\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "add x23, x23, x14\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "tbz %x[n_channels], #1, 44f\n" + "ld1 { v10.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 45f\n" + "ld1 { v10.s }[2], [x23], #0x4\n" + "b 45f\n" + "44:" // Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v10.s }[0], [x23], #0x4\n" + "45:" // Oddments: Load input (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v10.4s\n" + "ldr x10, [x16, #0xc0]\n" + "fmla v27.4s, v3.4s, v10.4s\n" + "add x10, x10, x14\n" + "fmla v23.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 46f\n" + "ld1 { v11.d }[0], [x10], #0x8\n" + "tbz %x[n_channels], #0, 47f\n" + "ld1 { v11.s }[2], [x10], #0x4\n" + "b 47f\n" + "46:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x10], #0x4\n" + "47:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v25.4s, v8.4s, v11.4s\n" + "ldr x9, [x16, #0xc8]\n" + "fmla v24.4s, v7.4s, v11.4s\n" + "add x9, x9, x14\n" + "fmla v21.4s, v5.4s, v11.4s\n" + "fmla v20.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v2.4s, v11.4s\n" + "fmla v16.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 48f\n" + "ld1 { v12.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 49f\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "b 49f\n" + "48:" // Oddments: Load input (2, 5): Bit 1: Unset + "ld1 { v12.s }[0], [x9], #0x4\n" + "49:" // Oddments: Load input (2, 5): Bit 1: End + "fmla v28.4s, v8.4s, v12.4s\n" + "ldr x28, [x16, #0xd0]\n" + "fmla v24.4s, v5.4s, v12.4s\n" + "add x28, x28, x14\n" + "fmla v20.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 50f\n" + "ld1 { v10.d }[0], [x28], #0x8\n" + "tbz %x[n_channels], #0, 51f\n" + "ld1 { v10.s }[2], [x28], #0x4\n" + "b 51f\n" + "50:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v10.s }[0], [x28], #0x4\n" + "51:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v27.4s, v6.4s, v10.4s\n" + "ldr x27, [x16, #0xd8]\n" + "fmla v23.4s, v3.4s, v10.4s\n" + "add x27, x27, x14\n" + "fmla v19.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 52f\n" + "ld1 { v11.d }[0], [x27], #0x8\n" + "tbz %x[n_channels], #0, 53f\n" + "ld1 { v11.s }[2], [x27], #0x4\n" + "b 53f\n" + "52:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v11.s }[0], [x27], #0x4\n" + "53:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v23.4s, v8.4s, v11.4s\n" + "ldr x26, [x16, #0xe0]\n" + "fmla v22.4s, v7.4s, v11.4s\n" + "add x26, x26, x14\n" + "fmla v21.4s, v6.4s, v11.4s\n" + "fmla v19.4s, v5.4s, v11.4s\n" + "fmla v18.4s, v4.4s, v11.4s\n" + "fmla v17.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 54f\n" + "ld1 { v12.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 55f\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "b 55f\n" + "54:" // Oddments: Load input (3, 5): Bit 1: Unset + "ld1 { v12.s }[0], [x26], #0x4\n" + "55:" // Oddments: Load input (3, 5): Bit 1: End + "fmla v24.4s, v8.4s, v12.4s\n" + "ldr x25, [x16, #0xe8]\n" + "fmla v20.4s, v5.4s, v12.4s\n" + "add x25, x25, x14\n" + "fmla v16.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 56f\n" + "ld1 { v10.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 57f\n" + "ld1 { v10.s }[2], [x25], #0x4\n" + "b 57f\n" + "56:" // 
Oddments: Load input (5, 2): Bit 1: Unset + "ld1 { v10.s }[0], [x25], #0x4\n" + "57:" // Oddments: Load input (5, 2): Bit 1: End + "fmla v19.4s, v8.4s, v10.4s\n" + "ldr x24, [x16, #0xf0]\n" + "fmla v18.4s, v7.4s, v10.4s\n" + "add x24, x24, x14\n" + "fmla v17.4s, v6.4s, v10.4s\n" + "tbz %x[n_channels], #1, 58f\n" + "ld1 { v11.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 59f\n" + "ld1 { v11.s }[2], [x24], #0x4\n" + "b 59f\n" + "58:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v11.s }[0], [x24], #0x4\n" + "59:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v22.4s, v8.4s, v11.4s\n" + "ldr x23, [x16, #0xf8]\n" + "fmla v21.4s, v7.4s, v11.4s\n" + "add x23, x23, x14\n" + "fmla v20.4s, v6.4s, v11.4s\n" + "fmla v18.4s, v5.4s, v11.4s\n" + "fmla v17.4s, v4.4s, v11.4s\n" + "fmla v16.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 60f\n" + "ld1 { v12.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 61f\n" + "ld1 { v12.s }[2], [x23], #0x4\n" + "b 61f\n" + "60:" // Oddments: Load input (5, 3): Bit 1: Unset + "ld1 { v12.s }[0], [x23], #0x4\n" + "61:" // Oddments: Load input (5, 3): Bit 1: End + "fmla v18.4s, v8.4s, v12.4s\n" + "ldr x10, [x16, #0x100]\n" + "fmla v17.4s, v7.4s, v12.4s\n" + "add x10, x10, x14\n" + "fmla v16.4s, v6.4s, v12.4s\n" + "tbz %x[n_channels], #1, 62f\n" + "ld1 { v10.d }[0], [x10], #0x8\n" + "tbz %x[n_channels], #0, 63f\n" + "ld1 { v10.s }[2], [x10], #0x4\n" + "b 63f\n" + "62:" // Oddments: Load input (1, 1): Bit 1: Unset + "ld1 { v10.s }[0], [x10], #0x4\n" + "63:" // Oddments: Load input (1, 1): Bit 1: End + "fmla v31.4s, v4.4s, v10.4s\n" + "ldr x9, [x16, #0x108]\n" + "fmla v30.4s, v3.4s, v10.4s\n" + "add x9, x9, x14\n" + "fmla v27.4s, v1.4s, v10.4s\n" + "fmla v26.4s, v0.4s, v10.4s\n" + "tbz %x[n_channels], #1, 64f\n" + "ld1 { v11.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 65f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "b 65f\n" + "64:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x9], #0x4\n" + "65:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v29.4s, v5.4s, v11.4s\n" + "ldr x28, [x16, #0x110]\n" + "fmla v28.4s, v4.4s, v11.4s\n" + "add x28, x28, x14\n" + "fmla v25.4s, v2.4s, v11.4s\n" + "fmla v24.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 66f\n" + "ld1 { v12.d }[0], [x28], #0x8\n" + "tbz %x[n_channels], #0, 67f\n" + "ld1 { v12.s }[2], [x28], #0x4\n" + "b 67f\n" + "66:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v12.s }[0], [x28], #0x4\n" + "67:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v23.4s, v7.4s, v12.4s\n" + "ldr x27, [x16, #0x118]\n" + "fmla v22.4s, v6.4s, v12.4s\n" + "add x27, x27, x14\n" + "fmla v19.4s, v4.4s, v12.4s\n" + "fmla v18.4s, v3.4s, v12.4s\n" + "tbz %x[n_channels], #1, 68f\n" + "ld1 { v10.d }[0], [x27], #0x8\n" + "tbz %x[n_channels], #0, 69f\n" + "ld1 { v10.s }[2], [x27], #0x4\n" + "b 69f\n" + "68:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v10.s }[0], [x27], #0x4\n" + "69:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v21.4s, v8.4s, v10.4s\n" + "fmla v20.4s, v7.4s, v10.4s\n" + "fmla v17.4s, v5.4s, v10.4s\n" + "fmla v16.4s, v4.4s, v10.4s\n" + "fmax v31.4s, v31.4s, v15.4s\n" + "fmax v30.4s, v30.4s, v15.4s\n" + "fmax v29.4s, v29.4s, v15.4s\n" + "fmin v31.4s, v31.4s, v14.4s\n" + "fmin v30.4s, v30.4s, v14.4s\n" + "fmin v29.4s, v29.4s, v14.4s\n" + "fmax v28.4s, v28.4s, v15.4s\n" + "fmax v27.4s, v27.4s, v15.4s\n" + "fmax v26.4s, v26.4s, v15.4s\n" + "fmin v28.4s, v28.4s, v14.4s\n" + "fmin v27.4s, v27.4s, v14.4s\n" + "fmin v26.4s, v26.4s, v14.4s\n" + "fmax v25.4s, v25.4s, 
v15.4s\n" + "fmax v24.4s, v24.4s, v15.4s\n" + "fmax v23.4s, v23.4s, v15.4s\n" + "fmin v25.4s, v25.4s, v14.4s\n" + "fmin v24.4s, v24.4s, v14.4s\n" + "fmin v23.4s, v23.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v15.4s\n" + "fmax v21.4s, v21.4s, v15.4s\n" + "fmax v20.4s, v20.4s, v15.4s\n" + "fmin v22.4s, v22.4s, v14.4s\n" + "fmin v21.4s, v21.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v15.4s\n" + "fmax v18.4s, v18.4s, v15.4s\n" + "fmax v17.4s, v17.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v14.4s\n" + "fmin v18.4s, v18.4s, v14.4s\n" + "fmin v17.4s, v17.4s, v14.4s\n" + "fmax v16.4s, v16.4s, v15.4s\n" + "fmin v16.4s, v16.4s, v14.4s\n" + "tbz %x[n_channels], #1, 70f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.d }[0], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.d }[0], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.d }[0], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.d }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.d }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.d }[0], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.d }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.d }[0], [x19]\n" + "ldr x21, [x17, #0x48]\n" + "add x21, x21, x12\n" + "st1 { v23.d }[0], [x22]\n" + "ldr x20, [x17, #0x50]\n" + "add x20, x20, x12\n" + "st1 { v22.d }[0], [x21]\n" + "ldr x19, [x17, #0x58]\n" + "add x19, x19, x12\n" + "st1 { v21.d }[0], [x20]\n" + "ldr x22, [x17, #0x60]\n" + "add x22, x22, x12\n" + "st1 { v20.d }[0], [x19]\n" + "ldr x21, [x17, #0x68]\n" + "add x21, x21, x12\n" + "st1 { v19.d }[0], [x22]\n" + "ldr x20, [x17, #0x70]\n" + "add x20, x20, x12\n" + "st1 { v18.d }[0], [x21]\n" + "ldr x19, [x17, #0x78]\n" + "add x19, x19, x12\n" + "st1 { v17.d }[0], [x20]\n" + "add x12, x12, #0x8\n" + "st1 { v16.d }[0], [x19]\n" + "tbz %x[n_channels], #0, 71f\n" + "ldr x22, [x17, #0x0]\n" + "ldr x21, [x17, #0x8]\n" + "add x22, x22, x12\n" + "ldr x20, [x17, #0x10]\n" + "ldr x19, [x17, #0x18]\n" + "add x21, x21, x12\n" + "st1 { v31.s }[2], [x22]\n" + "add x20, x20, x12\n" + "st1 { v30.s }[2], [x21]\n" + "ldr x22, [x17, #0x20]\n" + "add x19, x19, x12\n" + "st1 { v29.s }[2], [x20]\n" + "add x22, x22, x12\n" + "st1 { v28.s }[2], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.s }[2], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.s }[2], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.s }[2], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.s }[2], [x19]\n" + "ldr x21, [x17, #0x48]\n" + "add x21, x21, x12\n" + "st1 { v23.s }[2], [x22]\n" + "ldr x20, [x17, #0x50]\n" + "add x20, x20, x12\n" + "st1 { v22.s }[2], [x21]\n" + "ldr x19, [x17, #0x58]\n" + "add x19, x19, x12\n" + "st1 { v21.s }[2], [x20]\n" + "ldr x22, [x17, #0x60]\n" + "add x22, x22, x12\n" + "st1 { v20.s }[2], [x19]\n" + "ldr x21, [x17, #0x68]\n" + "add x21, x21, x12\n" + "st1 { v19.s }[2], [x22]\n" + "ldr x20, [x17, #0x70]\n" + "add x20, x20, x12\n" + "st1 { v18.s }[2], [x21]\n" + "ldr x19, [x17, #0x78]\n" + "add x19, x19, x12\n" + "st1 { v17.s }[2], [x20]\n" + "st1 { v16.s }[2], [x19]\n" + "b 71f\n" + "70:" // Oddments: Store: Bit 1: Unset + "ldr x22, [x17, #0x0]\n" + "add x22, x22, x12\n" + "ldr x21, [x17, #0x8]\n" + "ldr x20, [x17, #0x10]\n" + "add x21, x21, x12\n" + 
"st1 { v31.s }[0], [x22]\n" + "ldr x19, [x17, #0x18]\n" + "add x20, x20, x12\n" + "st1 { v30.s }[0], [x21]\n" + "add x19, x19, x12\n" + "st1 { v29.s }[0], [x20]\n" + "ldr x22, [x17, #0x20]\n" + "add x22, x22, x12\n" + "st1 { v28.s }[0], [x19]\n" + "ldr x21, [x17, #0x28]\n" + "add x21, x21, x12\n" + "st1 { v27.s }[0], [x22]\n" + "ldr x20, [x17, #0x30]\n" + "add x20, x20, x12\n" + "st1 { v26.s }[0], [x21]\n" + "ldr x19, [x17, #0x38]\n" + "add x19, x19, x12\n" + "st1 { v25.s }[0], [x20]\n" + "ldr x22, [x17, #0x40]\n" + "add x22, x22, x12\n" + "st1 { v24.s }[0], [x19]\n" + "ldr x21, [x17, #0x48]\n" + "add x21, x21, x12\n" + "st1 { v23.s }[0], [x22]\n" + "ldr x20, [x17, #0x50]\n" + "add x20, x20, x12\n" + "st1 { v22.s }[0], [x21]\n" + "ldr x19, [x17, #0x58]\n" + "add x19, x19, x12\n" + "st1 { v21.s }[0], [x20]\n" + "ldr x22, [x17, #0x60]\n" + "add x22, x22, x12\n" + "st1 { v20.s }[0], [x19]\n" + "ldr x21, [x17, #0x68]\n" + "add x21, x21, x12\n" + "st1 { v19.s }[0], [x22]\n" + "ldr x20, [x17, #0x70]\n" + "add x20, x20, x12\n" + "st1 { v18.s }[0], [x21]\n" + "ldr x19, [x17, #0x78]\n" + "add x19, x19, x12\n" + "st1 { v17.s }[0], [x20]\n" + "st1 { v16.s }[0], [x19]\n" + "71:" // Oddments: Store: Bit 1: End + + "72:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..8eb560562b --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8eb560562b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  indirect_kern_type indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+  a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
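The input_rows/input_cols constants in the header above follow from the other geometry fields: the input extent is (output extent - 1) * stride + kernel extent, which for this stride-2 kernel is (2 - 1) * 2 + 3 = 5 in both dimensions. A compile-time restatement of that relationship, illustrative only and not part of the patch:

    // Illustrative check of the tile geometry declared in the struct above.
    constexpr unsigned int implied_input_extent(unsigned int output_extent,
                                                unsigned int stride,
                                                unsigned int kernel_extent)
    {
        return (output_extent - 1) * stride + kernel_extent;
    }
    static_assert(implied_input_extent(2, 2, 3) == 5,
                  "matches input_rows / input_cols = 5 above");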
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4466ec1974
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,612 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "mov x6, #0x0\n"
+    "mov x27, #0x0\n"
+    "1:" // Tile loop
+    "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x26, #0x4\n"
+    "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x25, #0x2\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x24, %x[params_struct], %[offsetof_args_min]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "add x21, %x[params_struct], %[offsetof_args_max]\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mov x22, #0x0\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x6, x23\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+    "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x17, x17, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1r { v19.4s }, [x24]\n"
+    "add x14, x17, x23, LSL #2\n"
+    "ld1r { v18.4s }, [x21]\n"
+    "add x13, x14, x23, LSL #2\n"
+    "lsl x8, x8, #0x2\n"
+    "add x12, x13, x23, LSL #2\n"
+    "add x11, x12, x23, LSL #2\n"
+    "add x10, x8, x8\n"
+    "add x9, x10, x8\n"
+    "add x28, x9, x8\n"
+    "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+    "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col
+    "mul x19, x19, x25\n" // offset *= output_tile_size
+    "add x15, x15, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "add x27, x15, x20, LSL #2\n"
+    "lsl x16, x16, #0x2\n"
+    "mov x21, #0x10\n" // cntb _, ALL, #1
+    "sub x20, XZR, x21\n"
+    "lsr x19, %x[n_channels], #0x2\n"
"cbz x19, 4f\n" + "ldr q17, [x7, #0x0]\n" + "ldr q0, [x7, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x7, #0x20]\n" + "ldr q2, [x7, #0x30]\n" + "ldr q3, [x7, #0x40]\n" + "ldr q4, [x7, #0x50]\n" + "ldr q5, [x7, #0x60]\n" + "ldr q6, [x7, #0x70]\n" + "ldr q7, [x7, #0x80]\n" + "ldr q8, [x7, #0x90]\n" + "add x7, x7, #0xa0\n" + "ldr q9, [x13, x10]\n" + "ld1 { v10.4s }, [x17]\n" + "ldr q11, [x17, x8]\n" + "ldr q12, [x17, x9]\n" + "ldr q13, [x17, x28]\n" + "ld1 { v14.4s }, [x14]\n" + "ldr q15, [x14, x8]\n" + "ldr q16, [x17, x10]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n" + "add x22, x22, #0x10\n" + "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n" + "add x17, x17, #0x10\n" + "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "ldr q17, [x7, #0x0]\n" + "add x21, x21, #0x10\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ld1 { v10.4s }, [x17]\n" + "cmp x21, x19, LSL #4\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "ldr q12, [x14, x28]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x14, x9]\n" + "fmla v30.4s, v2.4s, v13.4s\n" + "ldr q13, [x14, x10]\n" + "add x14, x14, #0x10\n" + "fmla v31.4s, v3.4s, v14.4s\n" + "ld1 { v14.4s }, [x12]\n" + "fmla v30.4s, v0.4s, v16.4s\n" + "fmla v31.4s, v4.4s, v15.4s\n" + "ld1 { v15.4s }, [x13]\n" + "fmla v29.4s, v3.4s, v14.4s\n" + "ldr q14, [x12, x28]\n" + "fmla v30.4s, v4.4s, v11.4s\n" + "ldr q11, [x12, x8]\n" + "fmla v31.4s, v2.4s, v16.4s\n" + "ldr q16, [x13, x8]\n" + "fmla v29.4s, v0.4s, v15.4s\n" + "ldr q0, [x7, #0x10]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr q12, [x13, x9]\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "ldr q11, [x13, x28]\n" + "add x13, x13, #0x10\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "ldr q13, [x12, x9]\n" + "ldr q9, [x13, x10]\n" + "fmla v31.4s, v6.4s, v15.4s\n" + "ld1 { v15.4s }, [x11]\n" + "fmla v29.4s, v1.4s, v16.4s\n" + "fmla v28.4s, v4.4s, v13.4s\n" + "ldr q13, [x11, x8]\n" + "fmla v30.4s, v7.4s, v12.4s\n" + "ldr q4, [x7, #0x50]\n" + "fmla v31.4s, v7.4s, v16.4s\n" + "ldr q16, [x12, x10]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v6.4s, v15.4s\n" + "ldr q15, [x11, x10]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q12, [x17, x9]\n" + "fmla v30.4s, v8.4s, v11.4s\n" + "ldr q1, [x7, #0x20]\n" + "fmax v31.4s, v31.4s, v19.4s\n" + "fmla v29.4s, v7.4s, v13.4s\n" + "ldr q13, [x17, x28]\n" + "fmla v28.4s, v5.4s, v14.4s\n" + "ldr q14, [x11, x9]\n" + "fmax v30.4s, v30.4s, v19.4s\n" + "fmin v31.4s, v31.4s, v18.4s\n" + "st1 { v31.4s }, [x15]\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v5.4s, v16.4s\n" + "ldr q11, [x11, x28]\n" + "add x11, x11, #0x10\n" + "fmin v30.4s, v30.4s, v18.4s\n" + "ldr q2, [x7, #0x30]\n" + "ldr q5, [x7, #0x60]\n" + "fmla v28.4s, v3.4s, v16.4s\n" + "ldr q16, [x17, x10]\n" + "fmla v29.4s, v8.4s, v15.4s\n" + "str q30, [x15, x16]\n" + "add x15, x15, #0x10\n" + "fmla v28.4s, v7.4s, v14.4s\n" + "ld1 { v14.4s }, [x14]\n" + "fmax v29.4s, v29.4s, v19.4s\n" + "ldr q3, [x7, #0x40]\n" + "ldr q7, [x7, #0x80]\n" + "fmin v29.4s, v29.4s, v18.4s\n" + "st1 { v29.4s }, [x27]\n" + "fmla v28.4s, v6.4s, v15.4s\n" + "ldr q15, [x14, x8]\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "ldr q11, [x17, x8]\n" + "ldr q6, [x7, #0x70]\n" + "fmax v28.4s, v28.4s, v19.4s\n" + "ldr q8, [x7, #0x90]\n" + "add x7, x7, #0xa0\n" + "fmin v28.4s, v28.4s, v18.4s\n" + "str q28, [x27, x16]\n" + "add x27, x27, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, 
v9.4s\n" + "add x17, x17, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n" + "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n" + "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "ldr q12, [x14, x28]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x14, x9]\n" + "fmla v30.4s, v2.4s, v13.4s\n" + "ldr q13, [x14, x10]\n" + "add x14, x14, #0x10\n" + "fmla v31.4s, v3.4s, v14.4s\n" + "ld1 { v14.4s }, [x12]\n" + "fmla v30.4s, v0.4s, v16.4s\n" + "fmla v31.4s, v4.4s, v15.4s\n" + "ld1 { v15.4s }, [x13]\n" + "fmla v30.4s, v4.4s, v11.4s\n" + "ldr q11, [x12, x8]\n" + "fmla v29.4s, v3.4s, v14.4s\n" + "ldr q14, [x12, x28]\n" + "fmla v31.4s, v2.4s, v16.4s\n" + "ldr q16, [x13, x8]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr q12, [x13, x9]\n" + "fmla v29.4s, v0.4s, v15.4s\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "ldr q13, [x12, x9]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "ldr q11, [x13, x28]\n" + "add x13, x13, #0x10\n" + "fmla v31.4s, v6.4s, v15.4s\n" + "ld1 { v15.4s }, [x11]\n" + "fmla v30.4s, v7.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v16.4s\n" + "fmla v28.4s, v4.4s, v13.4s\n" + "ldr q13, [x11, x8]\n" + "fmla v31.4s, v7.4s, v16.4s\n" + "ldr q16, [x12, x10]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v6.4s, v15.4s\n" + "ldr q15, [x11, x10]\n" + "fmla v30.4s, v8.4s, v11.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmax v31.4s, v31.4s, v19.4s\n" + "fmla v29.4s, v7.4s, v13.4s\n" + "fmax v30.4s, v30.4s, v19.4s\n" + "fmla v28.4s, v5.4s, v14.4s\n" + "ldr q14, [x11, x9]\n" + "fmin v31.4s, v31.4s, v18.4s\n" + "st1 { v31.4s }, [x15]\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v5.4s, v16.4s\n" + "ldr q11, [x11, x28]\n" + "add x11, x11, #0x10\n" + "fmin v30.4s, v30.4s, v18.4s\n" + "str q30, [x15, x16]\n" + "fmla v28.4s, v3.4s, v16.4s\n" + "add x15, x15, #0x10\n" + "fmla v29.4s, v8.4s, v15.4s\n" + "fmla v28.4s, v7.4s, v14.4s\n" + "fmax v29.4s, v29.4s, v19.4s\n" + "fmla v28.4s, v6.4s, v15.4s\n" + "fmin v29.4s, v29.4s, v18.4s\n" + "st1 { v29.4s }, [x27]\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "fmax v28.4s, v28.4s, v19.4s\n" + "fmin v28.4s, v28.4s, v18.4s\n" + "str q28, [x27, x16]\n" + "add x27, x27, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x3\n" + "beq 43f\n" + "ldr q17, [x7, #0x0]\n" + "ldr q0, [x7, #0x10]\n" + "add x26, x13, x10\n" + "ldr q1, [x7, #0x20]\n" + "add x25, x17, XZR\n" + "ldr q2, [x7, #0x30]\n" + "add x24, x17, x8\n" + "ldr q3, [x7, #0x40]\n" + "add x23, x17, x9\n" + "ldr q4, [x7, #0x50]\n" + "add x22, x17, x28\n" + "ldr q5, [x7, #0x60]\n" + "add x21, x14, XZR\n" + "ldr q6, [x7, #0x70]\n" + "add x20, x14, x8\n" + "ldr q7, [x7, #0x80]\n" + "add x19, x17, x10\n" + "ldr q8, [x7, #0x90]\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr d9, [x26], #0x8\n" + "ldr d10, [x25], #0x8\n" + "ldr d11, [x24], #0x8\n" + "ldr d12, [x23], #0x8\n" + "ldr d13, [x22], #0x8\n" + "ldr d14, [x21], #0x8\n" + "ldr d15, [x20], #0x8\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v9.s }[2], [x26]\n" + "ld1 { v10.s }[2], [x25]\n" + "ld1 { v11.s }[2], [x24]\n" + "ld1 { v12.s }[2], [x23]\n" + "ld1 { v13.s }[2], [x22]\n" + "ld1 { v14.s }[2], [x21]\n" + "ld1 { v15.s }[2], [x20]\n" + "ld1 { v16.s }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset + "ldr s9, [x26, #0x0]\n" + "ldr s10, [x25, #0x0]\n" + "ldr s11, [x24, #0x0]\n" + "ldr s12, [x23, #0x0]\n" + "ldr s13, [x22, #0x0]\n" + "ldr s14, 
[x21, #0x0]\n" + "ldr s15, [x20, #0x0]\n" + "ldr s16, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End + "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "add x19, x14, x9\n" + "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n" + "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n" + "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "fmla v30.4s, v2.4s, v13.4s\n" + "fmla v31.4s, v3.4s, v14.4s\n" + "fmla v30.4s, v0.4s, v16.4s\n" + "fmla v31.4s, v4.4s, v15.4s\n" + "fmla v31.4s, v2.4s, v16.4s\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v30.4s, v4.4s, v11.4s\n" + "add x19, x14, x28\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End + "fmla v30.4s, v5.4s, v12.4s\n" + "add x19, x14, x10\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End + "fmla v31.4s, v5.4s, v13.4s\n" + "add x19, x12, XZR\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr d14, [x19], #0x8\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v14.s }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr s14, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v29.4s, v3.4s, v14.4s\n" + "add x19, x13, XZR\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr d15, [x19], #0x8\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v15.s }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset + "ldr s15, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v15.4s\n" + "add x19, x12, x8\n" + "fmla v29.4s, v0.4s, v15.4s\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v29.4s, v4.4s, v11.4s\n" + "add x19, x13, x8\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v16.s }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr s16, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v31.4s, v7.4s, v16.4s\n" + "add x19, x12, x9\n" + "fmla v29.4s, v1.4s, v16.4s\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v28.4s, 
v4.4s, v13.4s\n" + "add x19, x13, x9\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v30.4s, v7.4s, v12.4s\n" + "add x19, x12, x28\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr d14, [x19], #0x8\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v14.s }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr s14, [x19, #0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v28.4s, v5.4s, v14.4s\n" + "add x19, x11, XZR\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr d15, [x19], #0x8\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v15.s }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset + "ldr s15, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End + "fmla v29.4s, v6.4s, v15.4s\n" + "add x19, x13, x28\n" + "tbz %x[n_channels], #1, 29f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v30.4s, v8.4s, v11.4s\n" + "add x19, x11, x8\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End + "fmla v29.4s, v7.4s, v13.4s\n" + "add x19, x12, x10\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v16.s }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr s16, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v29.4s, v5.4s, v16.4s\n" + "add x19, x11, x9\n" + "fmla v28.4s, v3.4s, v16.4s\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr d14, [x19], #0x8\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v14.s }[2], [x19]\n" + "b 36f\n" + "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr s14, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v28.4s, v7.4s, v14.4s\n" + "add x19, x11, x10\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr d15, [x19], #0x8\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v15.s }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset + "ldr s15, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End + "fmla v29.4s, v8.4s, v15.4s\n" + "add x19, x11, x28\n" + "fmla v28.4s, v6.4s, v15.4s\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v28.4s, v8.4s, v11.4s\n" + "fmax v31.4s, v31.4s, v19.4s\n" + "fmax v30.4s, v30.4s, v19.4s\n" + "fmax v29.4s, v29.4s, v19.4s\n" + "fmin v31.4s, v31.4s, v18.4s\n" + "fmin v30.4s, v30.4s, v18.4s\n" + "fmin v29.4s, v29.4s, v18.4s\n" + "fmax v28.4s, v28.4s, v19.4s\n" + "fmin 
v28.4s, v28.4s, v18.4s\n" + "tbz %x[n_channels], #1, 41f\n" + "mov x19, x15\n" + "st1 { v31.d }[0], [x19], x16\n" + "add x15, x15, #0x8\n" + "st1 { v30.d }[0], [x19]\n" + "mov x19, x27\n" + "st1 { v29.d }[0], [x19], x16\n" + "add x27, x27, #0x8\n" + "st1 { v28.d }[0], [x19]\n" + "tbz %x[n_channels], #0, 42f\n" + "mov x20, x15\n" + "st1 { v31.s }[2], [x20], x16\n" + "mov x19, x27\n" + "st1 { v30.s }[2], [x20]\n" + "st1 { v29.s }[2], [x19], x16\n" + "st1 { v28.s }[2], [x19]\n" + "b 42f\n" + "41:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x20, x15\n" + "st1 { v31.s }[0], [x20], x16\n" + "mov x19, x27\n" + "st1 { v30.s }[0], [x20]\n" + "st1 { v29.s }[0], [x19], x16\n" + "st1 { v28.s }[0], [x19]\n" + "42:" // Tile loop: Oddments: Store: Bit 1: End + + "43:" // Tile loop: End + "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x6, #0x1\n" + "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x27, x27, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x27, x19\n" + "csel x27, x27, XZR, LT\n" + "csel x6, x6, x21, LT\n" + "cmp x6, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv
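The "43:" epilogue above walks the output tiles in row-major order without keeping a nested loop in registers: the incremented column index is compared against n_tile_cols, and the csel pair either keeps it or wraps it to zero while carrying into the row index. A minimal C++ sketch of that control flow, for orientation only — walk_tiles and process_tile are hypothetical names, not library API:

    #include <cstdint>

    template <typename F>
    void walk_tiles(uint64_t n_tile_rows, uint64_t n_tile_cols, F &&process_tile)
    {
        uint64_t tile_i = 0, tile_j = 0;
        do
        {
            process_tile(tile_i, tile_j);  // body of the "1:" tile loop
            tile_j++;                      // "add x27, x27, #0x1"
            if (tile_j >= n_tile_cols)     // "cmp x27, x19" plus the two csels
            {
                tile_j = 0;                // wrap the column index...
                tile_i++;                  // ...and carry into the row index
            }
        } while (tile_i < n_tile_rows);    // "cmp x6, x20" / "blt 1b"
    }

The do/while shape mirrors the assembly, which falls straight into label 1 and only tests the row bound at the bottom of the loop.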
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..a5153019e7 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[25]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[12]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[1]; + inptrs[3] = input_ptrs[3]; + inptrs[4] = input_ptrs[4]; + inptrs[5] = input_ptrs[5]; + inptrs[6] = input_ptrs[6]; + inptrs[7] = input_ptrs[2]; + inptrs[8] = input_ptrs[8]; + inptrs[9] = input_ptrs[9]; + inptrs[10] = input_ptrs[7]; + inptrs[11] = input_ptrs[15]; + inptrs[12] = input_ptrs[10]; + inptrs[13] = input_ptrs[16]; + inptrs[14] = input_ptrs[11]; + inptrs[15] = input_ptrs[18]; + inptrs[16] = input_ptrs[13]; + inptrs[17] = input_ptrs[19]; + inptrs[18] = input_ptrs[20]; + inptrs[19] = input_ptrs[14]; + inptrs[20] = input_ptrs[21]; + inptrs[21] = input_ptrs[17]; + inptrs[22] = input_ptrs[23]; + inptrs[23] = input_ptrs[22]; + inptrs[24] = input_ptrs[24]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x20, %x[params_struct], %[offsetof_args_min]\n" + "add x19, %x[params_struct], %[offsetof_args_max]\n" + "ld1r { v19.4s }, [x20]\n" + "ld1r { v18.4s }, [x19]\n" + "mov x14, #0x0\n" + "ldp x13, x12, [x21, #0x0]\n" + "mov x11, #0x10\n" // cntb _, ALL, #1 + "ldp x10, x9, [x21, #0x10]\n" + "sub x28, XZR, x11\n" + "lsr x27, %x[n_channels], #0x2\n" + "cbz x27, 3f\n" + "ldr q17, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "cmp x11, x27, LSL #4\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldr q9, [x26, x14]\n" + "ldr q10, [x25, x14]\n" + "ldr q11, [x24, x14]\n" + "ldr q12, [x23, x14]\n" + "ldr q13, [x22, x14]\n" + "ldr q14, [x21, x14]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ldr q15, [x20, x14]\n" + "ldr q16, [x19, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x26, [x16, #0x40]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n" + "ldr x25, [x16, #0x48]\n" + "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n" + "ldr x24, [x16, #0x50]\n" + "mov
v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "ldr x23, [x16, #0x58]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "ldr q12, [x25, x14]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x26, x14]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v30.4s, v2.4s, v13.4s\n" + "ldr q13, [x24, x14]\n" + "fmla v31.4s, v3.4s, v14.4s\n" + "ldr q14, [x23, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v30.4s, v0.4s, v16.4s\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.4s, v4.4s, v15.4s\n" + "ldr q15, [x22, x14]\n" + "fmla v29.4s, v3.4s, v14.4s\n" + "ldr x25, [x16, #0x88]\n" + "fmla v30.4s, v4.4s, v11.4s\n" + "ldr q11, [x21, x14]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.4s, v2.4s, v16.4s\n" + "ldr q16, [x20, x14]\n" + "fmla v29.4s, v0.4s, v15.4s\n" + "ldr q14, [x25, x14]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr q12, [x26, x14]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "ldr q11, [x23, x14]\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "ldr q13, [x19, x14]\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v31.4s, v6.4s, v15.4s\n" + "ldr q15, [x24, x14]\n" + "fmla v29.4s, v1.4s, v16.4s\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v30.4s, v7.4s, v12.4s\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.4s, v4.4s, v13.4s\n" + "ldr q13, [x22, x14]\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v31.4s, v7.4s, v16.4s\n" + "fmla v29.4s, v6.4s, v15.4s\n" + "ldr q16, [x21, x14]\n" + "fmla v30.4s, v8.4s, v11.4s\n" + "ldr q15, [x19, x14]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q17, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "fmla v29.4s, v7.4s, v13.4s\n" + "fmax v31.4s, v31.4s, v19.4s\n" + "ldr q1, [x15, #0x20]\n" + "fmax v30.4s, v30.4s, v19.4s\n" + "ldr q4, [x15, #0x50]\n" + "fmla v28.4s, v5.4s, v14.4s\n" + "ldr q14, [x20, x14]\n" + "fmin v31.4s, v31.4s, v18.4s\n" + "str q31, [x13, x28]\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v5.4s, v16.4s\n" + "ldr q11, [x26, x14]\n" + "add x14, x14, #0x10\n" + "fmin v30.4s, v30.4s, v18.4s\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "fmla v28.4s, v3.4s, v16.4s\n" + "ldp x22, x21, [x16, #0x20]\n" + "fmla v29.4s, v8.4s, v15.4s\n" + "ldr q9, [x26, x11]\n" + "ldr q10, [x25, x11]\n" + "fmla v28.4s, v7.4s, v14.4s\n" + "ldr q12, [x23, x11]\n" + "fmax v29.4s, v29.4s, v19.4s\n" + "ldr q13, [x22, x11]\n" + "ldr q14, [x21, x11]\n" + "fmin v29.4s, v29.4s, v18.4s\n" + "ldp x20, x19, [x16, #0x30]\n" + "str q30, [x12, x28]\n" + "fmla v28.4s, v6.4s, v15.4s\n" + "ldr q2, [x15, #0x30]\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "ldr q11, [x24, x11]\n" + "ldr q15, [x20, x11]\n" + "fmax v28.4s, v28.4s, v19.4s\n" + "ldr q16, [x19, x11]\n" + "add x11, x11, #0x10\n" + "fmin v28.4s, v28.4s, v18.4s\n" + "str q29, [x10, x28]\n" + "cmp x11, x27, LSL #4\n" + "ldr q3, [x15, #0x40]\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "str q28, [x9, x28]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "add x15, x15, #0xa0\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x26, [x16, #0x40]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n" + "ldr x25, [x16, #0x48]\n" + "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n" + "ldr x24, [x16, #0x50]\n" + "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "ldr x23, [x16, #0x58]\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "ldr q12, 
[x25, x14]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x26, x14]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v30.4s, v2.4s, v13.4s\n" + "ldr q13, [x24, x14]\n" + "fmla v31.4s, v3.4s, v14.4s\n" + "ldr q14, [x23, x14]\n" + "ldr x19, [x16, #0x78]\n" + "fmla v30.4s, v0.4s, v16.4s\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.4s, v4.4s, v15.4s\n" + "ldr q15, [x22, x14]\n" + "fmla v29.4s, v3.4s, v14.4s\n" + "ldr x25, [x16, #0x88]\n" + "fmla v30.4s, v4.4s, v11.4s\n" + "ldr q11, [x21, x14]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.4s, v2.4s, v16.4s\n" + "ldr q16, [x20, x14]\n" + "fmla v29.4s, v0.4s, v15.4s\n" + "ldr q14, [x25, x14]\n" + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr q12, [x26, x14]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.4s, v5.4s, v13.4s\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v29.4s, v4.4s, v11.4s\n" + "ldr q11, [x23, x14]\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "ldr q13, [x19, x14]\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v31.4s, v6.4s, v15.4s\n" + "ldr q15, [x24, x14]\n" + "fmla v29.4s, v1.4s, v16.4s\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v30.4s, v7.4s, v12.4s\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.4s, v4.4s, v13.4s\n" + "ldr q13, [x22, x14]\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v31.4s, v7.4s, v16.4s\n" + "fmla v29.4s, v6.4s, v15.4s\n" + "ldr q16, [x21, x14]\n" + "fmla v30.4s, v8.4s, v11.4s\n" + "ldr q15, [x19, x14]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "fmla v29.4s, v7.4s, v13.4s\n" + "fmax v31.4s, v31.4s, v19.4s\n" + "fmax v30.4s, v30.4s, v19.4s\n" + "fmla v28.4s, v5.4s, v14.4s\n" + "ldr q14, [x20, x14]\n" + "fmin v31.4s, v31.4s, v18.4s\n" + "str q31, [x13, x28]\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v5.4s, v16.4s\n" + "ldr q11, [x26, x14]\n" + "add x14, x14, #0x10\n" + "fmin v30.4s, v30.4s, v18.4s\n" + "str q30, [x12, x28]\n" + "fmla v28.4s, v3.4s, v16.4s\n" + "fmla v29.4s, v8.4s, v15.4s\n" + "fmla v28.4s, v7.4s, v14.4s\n" + "fmax v29.4s, v29.4s, v19.4s\n" + "fmin v29.4s, v29.4s, v18.4s\n" + "str q29, [x10, x28]\n" + "fmla v28.4s, v6.4s, v15.4s\n" + "fmla v28.4s, v8.4s, v11.4s\n" + "fmax v28.4s, v28.4s, v19.4s\n" + "fmin v28.4s, v28.4s, v18.4s\n" + "str q28, [x9, x28]\n" + "3:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 42f\n" + "ldr q17, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x28, x14\n" + "ldr q1, [x15, #0x20]\n" + "add x13, x13, x28\n" + "ldr q2, [x15, #0x30]\n" + "add x12, x12, x28\n" + "ldr q3, [x15, #0x40]\n" + "add x10, x10, x28\n" + "ldr q4, [x15, #0x50]\n" + "add x9, x9, x28\n" + "ldr q5, [x15, #0x60]\n" + "ldr q6, [x15, #0x70]\n" + "ldr q7, [x15, #0x80]\n" + "ldr q8, [x15, #0x90]\n" + "ldr x26, [x16, #0x0]\n" + "ldr x25, [x16, #0x8]\n" + "ldr x24, [x16, #0x10]\n" + "add x26, x26, x14\n" + "ldr x23, [x16, #0x18]\n" + "add x25, x25, x14\n" + "ldr x22, [x16, #0x20]\n" + "add x24, x24, x14\n" + "ldr x21, [x16, #0x28]\n" + "add x23, x23, x14\n" + "ldr x20, [x16, #0x30]\n" + "add x22, x22, x14\n" + "ldr x19, [x16, #0x38]\n" + "add x21, x21, x14\n" + "add x20, x20, x14\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v9.d }[0], [x26], #0x8\n" + "ld1 { v10.d }[0], [x25], #0x8\n" + "ld1 { v11.d }[0], [x24], #0x8\n" + "ld1 { v12.d }[0], [x23], #0x8\n" + "ld1 { v13.d }[0], [x22], #0x8\n" + "ld1 { v14.d }[0], [x21], #0x8\n" + "ld1 { v15.d }[0], [x20], #0x8\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v9.s }[2], [x26], #0x4\n" + "ld1 { v10.s }[2], [x25], #0x4\n" + "ld1 { v11.s }[2], [x24], #0x4\n" + "ld1 { v12.s }[2], [x23], #0x4\n" + "ld1 { v13.s }[2], [x22], #0x4\n" + "ld1 { v14.s }[2], [x21], 
#0x4\n" + "ld1 { v15.s }[2], [x20], #0x4\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "b 5f\n" + "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset + "ld1 { v9.s }[0], [x26], #0x4\n" + "ld1 { v10.s }[0], [x25], #0x4\n" + "ld1 { v11.s }[0], [x24], #0x4\n" + "ld1 { v12.s }[0], [x23], #0x4\n" + "ld1 { v13.s }[0], [x22], #0x4\n" + "ld1 { v14.s }[0], [x21], #0x4\n" + "ld1 { v15.s }[0], [x20], #0x4\n" + "ld1 { v16.s }[0], [x19], #0x4\n" + "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End + "mov v31.16b, v17.16b\n fmla v31.4s, v8.4s, v9.4s\n" + "ldr x26, [x16, #0x40]\n" + "add x26, x26, x14\n" + "mov v30.16b, v17.16b\n fmla v30.4s, v6.4s, v9.4s\n" + "mov v29.16b, v17.16b\n fmla v29.4s, v2.4s, v9.4s\n" + "mov v28.16b, v17.16b\n fmla v28.4s, v0.4s, v9.4s\n" + "fmla v31.4s, v0.4s, v10.4s\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "fmla v30.4s, v2.4s, v13.4s\n" + "fmla v31.4s, v3.4s, v14.4s\n" + "fmla v30.4s, v0.4s, v16.4s\n" + "fmla v31.4s, v4.4s, v15.4s\n" + "fmla v31.4s, v2.4s, v16.4s\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v11.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v11.s }[2], [x26], #0x4\n" + "b 7f\n" + "6:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v11.s }[0], [x26], #0x4\n" + "7:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v30.4s, v4.4s, v11.4s\n" + "ldr x25, [x16, #0x48]\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v12.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v12.s }[2], [x25], #0x4\n" + "b 9f\n" + "8:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v12.s }[0], [x25], #0x4\n" + "9:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v30.4s, v5.4s, v12.4s\n" + "ldr x24, [x16, #0x50]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v13.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v13.s }[2], [x24], #0x4\n" + "b 11f\n" + "10:" // Oddments: Load input (1, 2): Bit 1: Unset + "ld1 { v13.s }[0], [x24], #0x4\n" + "11:" // Oddments: Load input (1, 2): Bit 1: End + "fmla v31.4s, v5.4s, v13.4s\n" + "ldr x23, [x16, #0x58]\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v14.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v14.s }[2], [x23], #0x4\n" + "b 13f\n" + "12:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v14.s }[0], [x23], #0x4\n" + "13:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v29.4s, v3.4s, v14.4s\n" + "ldr x22, [x16, #0x60]\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v15.d }[0], [x22], #0x8\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v15.s }[2], [x22], #0x4\n" + "b 15f\n" + "14:" // Oddments: Load input (2, 0): Bit 1: Unset + "ld1 { v15.s }[0], [x22], #0x4\n" + "15:" // Oddments: Load input (2, 0): Bit 1: End + "fmla v31.4s, v6.4s, v15.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v29.4s, v0.4s, v15.4s\n" + "add x21, x21, x14\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v11.d }[0], [x21], #0x8\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v11.s }[2], [x21], #0x4\n" + "b 17f\n" + "16:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v11.s }[0], [x21], #0x4\n" + "17:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v29.4s, v4.4s, v11.4s\n" + "ldr x20, [x16, #0x70]\n" + "add x20, x20, x14\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v16.d }[0], [x20], #0x8\n" + "tbz %x[n_channels], #0, 19f\n" + 
"ld1 { v16.s }[2], [x20], #0x4\n" + "b 19f\n" + "18:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v16.s }[0], [x20], #0x4\n" + "19:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v31.4s, v7.4s, v16.4s\n" + "ldr x19, [x16, #0x78]\n" + "fmla v29.4s, v1.4s, v16.4s\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v13.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v13.s }[2], [x19], #0x4\n" + "b 21f\n" + "20:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v13.s }[0], [x19], #0x4\n" + "21:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v28.4s, v4.4s, v13.4s\n" + "ldr x26, [x16, #0x80]\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v12.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "b 23f\n" + "22:" // Oddments: Load input (2, 3): Bit 1: Unset + "ld1 { v12.s }[0], [x26], #0x4\n" + "23:" // Oddments: Load input (2, 3): Bit 1: End + "fmla v30.4s, v7.4s, v12.4s\n" + "ldr x25, [x16, #0x88]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v14.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v14.s }[2], [x25], #0x4\n" + "b 25f\n" + "24:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v14.s }[0], [x25], #0x4\n" + "25:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v28.4s, v5.4s, v14.4s\n" + "ldr x24, [x16, #0x90]\n" + "add x24, x24, x14\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v15.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v15.s }[2], [x24], #0x4\n" + "b 27f\n" + "26:" // Oddments: Load input (4, 0): Bit 1: Unset + "ld1 { v15.s }[0], [x24], #0x4\n" + "27:" // Oddments: Load input (4, 0): Bit 1: End + "fmla v29.4s, v6.4s, v15.4s\n" + "ldr x23, [x16, #0x98]\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v11.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v11.s }[2], [x23], #0x4\n" + "b 29f\n" + "28:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x23], #0x4\n" + "29:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v30.4s, v8.4s, v11.4s\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v13.d }[0], [x22], #0x8\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v13.s }[2], [x22], #0x4\n" + "b 31f\n" + "30:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v13.s }[0], [x22], #0x4\n" + "31:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v29.4s, v7.4s, v13.4s\n" + "ldr x21, [x16, #0xa8]\n" + "add x21, x21, x14\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v16.d }[0], [x21], #0x8\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v16.s }[2], [x21], #0x4\n" + "b 33f\n" + "32:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v16.s }[0], [x21], #0x4\n" + "33:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v29.4s, v5.4s, v16.4s\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v28.4s, v3.4s, v16.4s\n" + "add x20, x20, x14\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v14.d }[0], [x20], #0x8\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v14.s }[2], [x20], #0x4\n" + "b 35f\n" + "34:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v14.s }[0], [x20], #0x4\n" + "35:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v28.4s, v7.4s, v14.4s\n" + "ldr x19, [x16, #0xb8]\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v15.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v15.s }[2], [x19], 
#0x4\n" + "b 37f\n" + "36:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v15.s }[0], [x19], #0x4\n" + "37:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v29.4s, v8.4s, v15.4s\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v28.4s, v6.4s, v15.4s\n" + "add x26, x26, x14\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v11.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v11.s }[2], [x26], #0x4\n" + "b 39f\n" + "38:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v11.s }[0], [x26], #0x4\n" + "39:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v28.4s, v8.4s, v11.4s\n" + "fmax v31.4s, v31.4s, v19.4s\n" + "fmax v30.4s, v30.4s, v19.4s\n" + "fmax v29.4s, v29.4s, v19.4s\n" + "fmin v31.4s, v31.4s, v18.4s\n" + "fmin v30.4s, v30.4s, v18.4s\n" + "fmin v29.4s, v29.4s, v18.4s\n" + "fmax v28.4s, v28.4s, v19.4s\n" + "fmin v28.4s, v28.4s, v18.4s\n" + "tbz %x[n_channels], #1, 40f\n" + "st1 { v31.d }[0], [x13], #0x8\n" + "st1 { v30.d }[0], [x12], #0x8\n" + "st1 { v29.d }[0], [x10], #0x8\n" + "st1 { v28.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 41f\n" + "st1 { v31.s }[2], [x13], #0x4\n" + "st1 { v30.s }[2], [x12], #0x4\n" + "st1 { v29.s }[2], [x10], #0x4\n" + "st1 { v28.s }[2], [x9], #0x4\n" + "b 41f\n" + "40:" // Oddments: Store: Bit 1: Unset + "st1 { v31.s }[0], [x13], #0x4\n" + "st1 { v30.s }[0], [x12], #0x4\n" + "st1 { v29.s }[0], [x10], #0x4\n" + "st1 { v28.s }[0], [x9], #0x4\n" + "41:" // Oddments: Store: Bit 1: End + + "42:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv
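Both variants of these kernels finish with an "Oddments" tail that handles the n_channels % 4 lanes left over after the main four-channel loop: bit 1 of n_channels selects a two-lane (d-register) access and bit 0 a single-lane (s-register) access, with the single lane landing in position 2 or 0 depending on which path was taken. A NEON-intrinsics sketch of the store side — illustrative only, the kernels do this directly in assembly with tbz:

    #include <arm_neon.h>

    static void store_oddments(float *dst, float32x4_t v, unsigned int n_channels)
    {
        unsigned int lane = 0;
        if (n_channels & 2)                   // "tbz %x[n_channels], #1, ..."
        {
            vst1_f32(dst, vget_low_f32(v));   // "st1 { v31.d }[0], ...": lanes 0-1
            dst += 2;
            lane = 2;
        }
        if (n_channels & 1)                   // "tbz %x[n_channels], #0, ..."
        {
            // "st1 { v31.s }[2], ..." after the two-lane store, else "st1 { v31.s }[0], ..."
            dst[0] = (lane == 2) ? vgetq_lane_f32(v, 2) : vgetq_lane_f32(v, 0);
        }
    }

The mirrored load sequences explain the paired two-lane/one-lane loads and the single-lane fallback labels that dominate the oddments sections above.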
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..314fe766de --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); +void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + +struct a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + indirect_kern_type indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; + + a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv
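The strategy struct above is the descriptor the depthfirst framework dispatches on; its geometry constants are tied together by the usual sliding-window relation, input = (output - 1) * stride + kernel, so this kernel consumes a 6x6 input patch per 2x2 output tile (and the indirect entry point correspondingly takes a pointer table covering input_rows * input_cols = 36 input points). A compile-time sketch of that invariant, assuming the header above is included — the library itself does not ship these asserts:

    using Strategy = arm_conv::depthwise::a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst;

    // (2 - 1) * 1 + 5 == 6 in both dimensions for this kernel
    static_assert(Strategy::input_rows
                  == (Strategy::output_rows - 1) * Strategy::stride_rows + Strategy::kernel_rows,
                  "input_rows must cover the receptive field of the output tile");
    static_assert(Strategy::input_cols
                  == (Strategy::output_cols - 1) * Strategy::stride_cols + Strategy::kernel_cols,
                  "input_cols must cover the receptive field of the output tile");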
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..170eb2267b --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,969 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "mov x28, #0x0\n" + "mov x27, #0x0\n" + "1:" // Tile loop + "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x26, #0x2\n" + "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x25, #0x2\n" + "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n" + "add x24, %x[params_struct], %[offsetof_args_min]\n" + "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "add x21, %x[params_struct], %[offsetof_args_max]\n" + "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "mov x22, #0x0\n" + "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x28, x23\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x27, x4, x19\n" // offset += tile_j * ld_input_col + "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x26\n" // offset *= kernel_stride * output_size + "ldr x7, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x5, x5, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1r { v18.4s }, [x24]\n" + "add x8, x5, x23, LSL #2\n" + "ld1r { v17.4s }, [x21]\n" + "add x17, x8, x23, LSL #2\n" + "lsl x4, x4, #0x2\n" + "add x16, x17, x23, LSL #2\n" + "add x15, x16, x23, LSL #2\n" + "add x14, x15, x23, LSL #2\n" + "add x13, x4, x4\n" + "add x12, x13, x4\n" + "add x11, x12, x4\n" + "add x10, x11, x4\n" + "mul x19, x28, x20\n" // offset = tile_i * ld_output_row + "madd x19, x27, x6, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x25\n" // offset *= output_tile_size + "add x7, x7, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "add x9, x7, x20, LSL #2\n" + "lsl x6, x6, #0x2\n" + "mov x21, #0x10\n" // cntb _, ALL, #1 + "sub x20, XZR,
x21\n" + "lsr x19, %x[n_channels], #0x2\n" + "cbz x19, 4f\n" + "ldr q16, [x3, #0x0]\n" + "ldr q0, [x3, #0x10]\n" + "cmp x21, x19, LSL #4\n" + "ldr q1, [x3, #0x20]\n" + "ldr q2, [x3, #0x30]\n" + "ldr q3, [x3, #0x40]\n" + "ldr q4, [x3, #0x50]\n" + "add x3, x3, #0x60\n" + "ld1 { v5.4s }, [x5]\n" + "ldr q6, [x5, x4]\n" + "ld1 { v7.4s }, [x8]\n" + "ldr q8, [x8, x4]\n" + "ldr q9, [x5, x13]\n" + "ldr q13, [x8, x13]\n" + "ldr q11, [x5, x12]\n" + "ldr q12, [x5, x11]\n" + "ldr q10, [x8, x10]\n" + "ld1 { v14.4s }, [x17]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n" + "ldr q5, [x8, x12]\n" + "add x20, x20, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n" + "add x22, x22, #0x10\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n" + "add x21, x21, #0x10\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n" + "ldr q0, [x3, #0x0]\n" + "cmp x21, x19, LSL #4\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x8, x11]\n" + "add x8, x8, #0x10\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "ldr q16, [x3, #0x140]\n" + "fmla v29.4s, v1.4s, v8.4s\n" + "fmla v28.4s, v1.4s, v13.4s\n" + "ldr q1, [x3, #0x10]\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "ldr q9, [x5, x10]\n" + "add x5, x5, #0x10\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v13.4s\n" + "fmla v28.4s, v2.4s, v5.4s\n" + "ldr q2, [x3, #0x20]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x17, x4]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v5.4s\n" + "fmla v28.4s, v3.4s, v6.4s\n" + "ldr q3, [x3, #0x30]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x17, x13]\n" + "fmla v30.4s, v4.4s, v9.4s\n" + "ldr q9, [x17, x12]\n" + "fmla v29.4s, v4.4s, v6.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x3, #0x40]\n" + "fmla v31.4s, v0.4s, v7.4s\n" + "ld1 { v7.4s }, [x8]\n" + "fmla v30.4s, v0.4s, v8.4s\n" + "fmla v29.4s, v0.4s, v14.4s\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q0, [x3, #0x50]\n" + "fmla v31.4s, v1.4s, v8.4s\n" + "ldr q8, [x17, x10]\n" + "fmla v30.4s, v1.4s, v13.4s\n" + "fmla v29.4s, v1.4s, v11.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q1, [x3, #0x60]\n" + "fmla v31.4s, v2.4s, v13.4s\n" + "ldr q13, [x17, x11]\n" + "add x17, x17, #0x10\n" + "fmla v30.4s, v2.4s, v5.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "fmla v28.4s, v2.4s, v9.4s\n" + "ldr q2, [x3, #0x70]\n" + "fmla v31.4s, v3.4s, v5.4s\n" + "ld1 { v5.4s }, [x16]\n" + "fmla v30.4s, v3.4s, v6.4s\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr q3, [x3, #0x80]\n" + "fmla v31.4s, v4.4s, v6.4s\n" + "ldr q6, [x16, x4]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "ldr q10, [x16, x13]\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "fmla v28.4s, v4.4s, v8.4s\n" + "ldr q4, [x3, #0x90]\n" + "fmla v31.4s, v0.4s, v14.4s\n" + "ldr q14, [x16, x10]\n" + "fmla v30.4s, v0.4s, v11.4s\n" + "fmla v29.4s, v0.4s, v5.4s\n" + "fmla v28.4s, v0.4s, v6.4s\n" + "ldr q0, [x3, #0xa0]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x16, x12]\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v6.4s\n" + "fmla v28.4s, v1.4s, v10.4s\n" + "ldr q1, [x3, #0xb0]\n" + "fmla v31.4s, v2.4s, v12.4s\n" + "ldr q12, [x16, x11]\n" + "add x16, x16, #0x10\n" + "fmla v30.4s, v2.4s, v9.4s\n" + "fmla v29.4s, v2.4s, v10.4s\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr q2, [x3, #0xc0]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "ld1 { v9.4s }, [x15]\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr q3, [x3, #0xd0]\n" + "fmla v31.4s, v4.4s, v13.4s\n" + "ldr q13, [x15, x4]\n" + "fmla v30.4s, 
v4.4s, v8.4s\n" + "ldr q8, [x15, x11]\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v4.4s, v14.4s\n" + "ldr q4, [x3, #0xe0]\n" + "fmla v31.4s, v0.4s, v5.4s\n" + "ldr q5, [x15, x13]\n" + "fmla v30.4s, v0.4s, v6.4s\n" + "fmla v29.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v0.4s, v13.4s\n" + "ldr q0, [x3, #0xf0]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x15, x12]\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v1.4s, v13.4s\n" + "fmla v28.4s, v1.4s, v5.4s\n" + "ldr q1, [x3, #0x100]\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr q10, [x15, x10]\n" + "add x15, x15, #0x10\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v5.4s\n" + "fmla v28.4s, v2.4s, v6.4s\n" + "ldr q2, [x3, #0x110]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ld1 { v11.4s }, [x14]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v6.4s\n" + "fmla v28.4s, v3.4s, v8.4s\n" + "ldr q3, [x3, #0x120]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x14, x4]\n" + "fmla v30.4s, v4.4s, v14.4s\n" + "ld1 { v14.4s }, [x17]\n" + "fmla v29.4s, v4.4s, v8.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x3, #0x130]\n" + "fmla v31.4s, v0.4s, v9.4s\n" + "ldr q9, [x14, x13]\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x14, x12]\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "ldr q0, [x3, #0x150]\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "ldr q13, [x8, x13]\n" + "fmla v30.4s, v1.4s, v5.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "ldr q12, [x14, x11]\n" + "fmla v28.4s, v1.4s, v9.4s\n" + "ldr q1, [x3, #0x160]\n" + "fmla v31.4s, v2.4s, v5.4s\n" + "ld1 { v5.4s }, [x5]\n" + "fmla v30.4s, v2.4s, v6.4s\n" + "fmla v29.4s, v2.4s, v9.4s\n" + "ldr q9, [x14, x10]\n" + "add x14, x14, #0x10\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr q2, [x3, #0x170]\n" + "fmla v31.4s, v3.4s, v6.4s\n" + "ldr q6, [x5, x4]\n" + "fmla v30.4s, v3.4s, v8.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "ldr q11, [x5, x12]\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr q3, [x3, #0x180]\n" + "fmla v31.4s, v4.4s, v8.4s\n" + "ldr q8, [x8, x4]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "ldr q10, [x8, x10]\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "ldr q12, [x5, x11]\n" + "fmla v28.4s, v4.4s, v9.4s\n" + "ldr q9, [x5, x13]\n" + "ldr q4, [x3, #0x190]\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "add x3, x3, #0x1a0\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "st1 { v31.4s }, [x7]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "str q30, [x7, x6]\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "add x7, x7, #0x10\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "st1 { v29.4s }, [x9]\n" + "str q28, [x9, x6]\n" + "add x9, x9, #0x10\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n" + "ldr q5, [x8, x12]\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n" + "ldr q0, [x3, #0x0]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x8, x11]\n" + "add x8, x8, #0x10\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "fmla v29.4s, v1.4s, v8.4s\n" + "fmla v28.4s, v1.4s, v13.4s\n" + "ldr q1, [x3, #0x10]\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "ldr q9, [x5, x10]\n" + "add x5, x5, #0x10\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v13.4s\n" + "fmla v28.4s, v2.4s, v5.4s\n" + "ldr q2, [x3, #0x20]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x17, x4]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v5.4s\n" + "fmla v28.4s, v3.4s, v6.4s\n" + "ldr q3, [x3, 
#0x30]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x17, x13]\n" + "fmla v30.4s, v4.4s, v9.4s\n" + "ldr q9, [x17, x12]\n" + "fmla v29.4s, v4.4s, v6.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x3, #0x40]\n" + "fmla v31.4s, v0.4s, v7.4s\n" + "fmla v30.4s, v0.4s, v8.4s\n" + "fmla v29.4s, v0.4s, v14.4s\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q0, [x3, #0x50]\n" + "fmla v31.4s, v1.4s, v8.4s\n" + "ldr q8, [x17, x10]\n" + "fmla v30.4s, v1.4s, v13.4s\n" + "fmla v29.4s, v1.4s, v11.4s\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q1, [x3, #0x60]\n" + "fmla v31.4s, v2.4s, v13.4s\n" + "ldr q13, [x17, x11]\n" + "add x17, x17, #0x10\n" + "fmla v30.4s, v2.4s, v5.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "fmla v28.4s, v2.4s, v9.4s\n" + "ldr q2, [x3, #0x70]\n" + "fmla v31.4s, v3.4s, v5.4s\n" + "ld1 { v5.4s }, [x16]\n" + "fmla v30.4s, v3.4s, v6.4s\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr q3, [x3, #0x80]\n" + "fmla v31.4s, v4.4s, v6.4s\n" + "ldr q6, [x16, x4]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "ldr q10, [x16, x13]\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "fmla v28.4s, v4.4s, v8.4s\n" + "ldr q4, [x3, #0x90]\n" + "fmla v31.4s, v0.4s, v14.4s\n" + "ldr q14, [x16, x10]\n" + "fmla v30.4s, v0.4s, v11.4s\n" + "fmla v29.4s, v0.4s, v5.4s\n" + "fmla v28.4s, v0.4s, v6.4s\n" + "ldr q0, [x3, #0xa0]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x16, x12]\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v6.4s\n" + "fmla v28.4s, v1.4s, v10.4s\n" + "ldr q1, [x3, #0xb0]\n" + "fmla v31.4s, v2.4s, v12.4s\n" + "ldr q12, [x16, x11]\n" + "add x16, x16, #0x10\n" + "fmla v30.4s, v2.4s, v9.4s\n" + "fmla v29.4s, v2.4s, v10.4s\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr q2, [x3, #0xc0]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "ld1 { v9.4s }, [x15]\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr q3, [x3, #0xd0]\n" + "fmla v31.4s, v4.4s, v13.4s\n" + "ldr q13, [x15, x4]\n" + "fmla v30.4s, v4.4s, v8.4s\n" + "ldr q8, [x15, x11]\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v4.4s, v14.4s\n" + "ldr q4, [x3, #0xe0]\n" + "fmla v31.4s, v0.4s, v5.4s\n" + "ldr q5, [x15, x13]\n" + "fmla v30.4s, v0.4s, v6.4s\n" + "fmla v29.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v0.4s, v13.4s\n" + "ldr q0, [x3, #0xf0]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x15, x12]\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v1.4s, v13.4s\n" + "fmla v28.4s, v1.4s, v5.4s\n" + "ldr q1, [x3, #0x100]\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr q10, [x15, x10]\n" + "add x15, x15, #0x10\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v5.4s\n" + "fmla v28.4s, v2.4s, v6.4s\n" + "ldr q2, [x3, #0x110]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ld1 { v11.4s }, [x14]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v6.4s\n" + "fmla v28.4s, v3.4s, v8.4s\n" + "ldr q3, [x3, #0x120]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x14, x4]\n" + "fmla v30.4s, v4.4s, v14.4s\n" + "fmla v29.4s, v4.4s, v8.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x3, #0x130]\n" + "add x3, x3, #0x140\n" + "fmla v31.4s, v0.4s, v9.4s\n" + "ldr q9, [x14, x13]\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x14, x12]\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "fmla v30.4s, v1.4s, v5.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "ldr q12, [x14, x11]\n" + "fmla v28.4s, v1.4s, v9.4s\n" + "fmla v31.4s, v2.4s, v5.4s\n" + "fmla v30.4s, v2.4s, v6.4s\n" + "fmla v29.4s, v2.4s, v9.4s\n" + "ldr q9, [x14, x10]\n" + "add x14, 
x14, #0x10\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "fmla v31.4s, v3.4s, v6.4s\n" + "fmla v30.4s, v3.4s, v8.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v31.4s, v4.4s, v8.4s\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v4.4s, v9.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "st1 { v31.4s }, [x7]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "str q30, [x7, x6]\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "add x7, x7, #0x10\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "st1 { v29.4s }, [x9]\n" + "str q28, [x9, x6]\n" + "add x9, x9, #0x10\n" + "4:" // Tile loop: Oddments + "tst %x[n_channels], #0x3\n" + "beq 61f\n" + "ldr q16, [x3, #0x0]\n" + "ldr q0, [x3, #0x10]\n" + "add x28, x5, XZR\n" + "ldr q1, [x3, #0x20]\n" + "add x27, x5, x4\n" + "ldr q2, [x3, #0x30]\n" + "add x26, x8, XZR\n" + "ldr q3, [x3, #0x40]\n" + "add x25, x8, x4\n" + "ldr q4, [x3, #0x50]\n" + "add x24, x5, x13\n" + "add x23, x8, x13\n" + "add x22, x5, x12\n" + "add x21, x5, x11\n" + "add x20, x8, x10\n" + "add x19, x17, XZR\n" + "add x3, x3, #0x60\n" + "tbz %x[n_channels], #1, 5f\n" + "ldr d5, [x28], #0x8\n" + "ldr d6, [x27], #0x8\n" + "ldr d7, [x26], #0x8\n" + "ldr d8, [x25], #0x8\n" + "ldr d9, [x24], #0x8\n" + "ldr d13, [x23], #0x8\n" + "ldr d11, [x22], #0x8\n" + "ldr d12, [x21], #0x8\n" + "ldr d10, [x20], #0x8\n" + "ldr d14, [x19], #0x8\n" + "tbz %x[n_channels], #0, 6f\n" + "ld1 { v5.s }[2], [x28]\n" + "ld1 { v6.s }[2], [x27]\n" + "ld1 { v7.s }[2], [x26]\n" + "ld1 { v8.s }[2], [x25]\n" + "ld1 { v9.s }[2], [x24]\n" + "ld1 { v13.s }[2], [x23]\n" + "ld1 { v11.s }[2], [x22]\n" + "ld1 { v12.s }[2], [x21]\n" + "ld1 { v10.s }[2], [x20]\n" + "ld1 { v14.s }[2], [x19]\n" + "b 6f\n" + "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset + "ldr s5, [x28, #0x0]\n" + "ldr s6, [x27, #0x0]\n" + "ldr s7, [x26, #0x0]\n" + "ldr s8, [x25, #0x0]\n" + "ldr s9, [x24, #0x0]\n" + "ldr s13, [x23, #0x0]\n" + "ldr s11, [x22, #0x0]\n" + "ldr s12, [x21, #0x0]\n" + "ldr s10, [x20, #0x0]\n" + "ldr s14, [x19, #0x0]\n" + "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n" + "add x19, x8, x12\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "fmla v29.4s, v1.4s, v8.4s\n" + "fmla v28.4s, v1.4s, v13.4s\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v13.4s\n" + "tbz %x[n_channels], #1, 7f\n" + "ldr d5, [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v5.s }[2], [x19]\n" + "b 8f\n" + "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset + "ldr s5, [x19, #0x0]\n" + "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End + "fmla v28.4s, v2.4s, v5.4s\n" + "add x19, x8, x11\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v5.4s\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr d6, [x19], #0x8\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v6.s }[2], [x19]\n" + "b 10f\n" + "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset + "ldr s6, [x19, #0x0]\n" + "10:" // Tile loop: Oddments: 
Load inputs: (1, 4): Bit 1: End + "fmla v28.4s, v3.4s, v6.4s\n" + "add x19, x5, x10\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "tbz %x[n_channels], #1, 11f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 12f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 12f\n" + "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End + "fmla v30.4s, v4.4s, v9.4s\n" + "ldr s0, [x3, #0x18]\n" + "add x19, x17, x4\n" + "fmla v29.4s, v4.4s, v6.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "fmla v31.4s, v0.4s, v7.4s\n" + "fmla v30.4s, v0.4s, v8.4s\n" + "fmla v29.4s, v0.4s, v14.4s\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 14f\n" + "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr s1, [x3, #0x1c]\n" + "add x19, x17, x13\n" + "fmla v31.4s, v1.4s, v8.4s\n" + "fmla v30.4s, v1.4s, v13.4s\n" + "fmla v29.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 15f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 16f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 16f\n" + "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr s2, [x3, #0x20]\n" + "add x19, x17, x12\n" + "fmla v31.4s, v2.4s, v13.4s\n" + "fmla v30.4s, v2.4s, v5.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 17f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 18f\n" + "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End + "fmla v28.4s, v2.4s, v9.4s\n" + "ldr s3, [x3, #0x24]\n" + "add x19, x17, x11\n" + "fmla v31.4s, v3.4s, v5.4s\n" + "fmla v30.4s, v3.4s, v6.4s\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "tbz %x[n_channels], #1, 19f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 20f\n" + "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr s4, [x3, #0x28]\n" + "add x19, x17, x10\n" + "fmla v31.4s, v4.4s, v6.4s\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "tbz %x[n_channels], #1, 21f\n" + "ldr d8, [x19], #0x8\n" + "tbz %x[n_channels], #0, 22f\n" + "ld1 { v8.s }[2], [x19]\n" + "b 22f\n" + "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset + "ldr s8, [x19, #0x0]\n" + "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End + "fmla v28.4s, v4.4s, v8.4s\n" + "ldr s0, [x3, #0x2c]\n" + "add x19, x16, XZR\n" + "fmla v31.4s, v0.4s, v14.4s\n" + "fmla v30.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 23f\n" + "ldr d5, [x19], #0x8\n" + "tbz %x[n_channels], #0, 24f\n" + "ld1 { v5.s }[2], [x19]\n" + "b 24f\n" + "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset + "ldr s5, [x19, #0x0]\n" + "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End + "fmla v29.4s, v0.4s, v5.4s\n" + "add x19, x16, x4\n" + "tbz %x[n_channels], #1, 25f\n" + "ldr d6, [x19], #0x8\n" + "tbz %x[n_channels], #0, 26f\n" + "ld1 { v6.s }[2], [x19]\n" + "b 26f\n" + "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset + "ldr s6, [x19, 
#0x0]\n" + "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End + "fmla v28.4s, v0.4s, v6.4s\n" + "ldr s1, [x3, #0x30]\n" + "add x19, x16, x13\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v6.4s\n" + "tbz %x[n_channels], #1, 27f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 28f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 28f\n" + "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End + "fmla v28.4s, v1.4s, v10.4s\n" + "ldr s2, [x3, #0x34]\n" + "add x19, x16, x12\n" + "fmla v31.4s, v2.4s, v12.4s\n" + "fmla v30.4s, v2.4s, v9.4s\n" + "fmla v29.4s, v2.4s, v10.4s\n" + "tbz %x[n_channels], #1, 29f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 30f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 30f\n" + "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr s3, [x3, #0x38]\n" + "add x19, x16, x11\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 31f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 32f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 32f\n" + "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr s4, [x3, #0x3c]\n" + "add x19, x16, x10\n" + "fmla v31.4s, v4.4s, v13.4s\n" + "fmla v30.4s, v4.4s, v8.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "tbz %x[n_channels], #1, 33f\n" + "ldr d14, [x19], #0x8\n" + "tbz %x[n_channels], #0, 34f\n" + "ld1 { v14.s }[2], [x19]\n" + "b 34f\n" + "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset + "ldr s14, [x19, #0x0]\n" + "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End + "fmla v28.4s, v4.4s, v14.4s\n" + "ldr s0, [x3, #0x40]\n" + "add x19, x15, XZR\n" + "fmla v31.4s, v0.4s, v5.4s\n" + "fmla v30.4s, v0.4s, v6.4s\n" + "tbz %x[n_channels], #1, 35f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 36f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 36f\n" + "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End + "fmla v29.4s, v0.4s, v9.4s\n" + "add x19, x15, x4\n" + "tbz %x[n_channels], #1, 37f\n" + "ldr d13, [x19], #0x8\n" + "tbz %x[n_channels], #0, 38f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 38f\n" + "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset + "ldr s13, [x19, #0x0]\n" + "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End + "fmla v28.4s, v0.4s, v13.4s\n" + "ldr s1, [x3, #0x44]\n" + "add x19, x15, x13\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v1.4s, v13.4s\n" + "tbz %x[n_channels], #1, 39f\n" + "ldr d5, [x19], #0x8\n" + "tbz %x[n_channels], #0, 40f\n" + "ld1 { v5.s }[2], [x19]\n" + "b 40f\n" + "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset + "ldr s5, [x19, #0x0]\n" + "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End + "fmla v28.4s, v1.4s, v5.4s\n" + "ldr s2, [x3, #0x48]\n" + "add x19, x15, x12\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v5.4s\n" + "tbz %x[n_channels], #1, 41f\n" + "ldr d6, [x19], #0x8\n" + "tbz %x[n_channels], #0, 42f\n" + "ld1 { v6.s }[2], [x19]\n" + "b 42f\n" + "41:" 
// Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset + "ldr s6, [x19, #0x0]\n" + "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End + "fmla v28.4s, v2.4s, v6.4s\n" + "ldr s3, [x3, #0x4c]\n" + "add x19, x15, x11\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v6.4s\n" + "tbz %x[n_channels], #1, 43f\n" + "ldr d8, [x19], #0x8\n" + "tbz %x[n_channels], #0, 44f\n" + "ld1 { v8.s }[2], [x19]\n" + "b 44f\n" + "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset + "ldr s8, [x19, #0x0]\n" + "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End + "fmla v28.4s, v3.4s, v8.4s\n" + "ldr s4, [x3, #0x50]\n" + "add x19, x15, x10\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "fmla v30.4s, v4.4s, v14.4s\n" + "fmla v29.4s, v4.4s, v8.4s\n" + "tbz %x[n_channels], #1, 45f\n" + "ldr d10, [x19], #0x8\n" + "tbz %x[n_channels], #0, 46f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 46f\n" + "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset + "ldr s10, [x19, #0x0]\n" + "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr s0, [x3, #0x54]\n" + "add x19, x14, XZR\n" + "fmla v31.4s, v0.4s, v9.4s\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "tbz %x[n_channels], #1, 47f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 48f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 48f\n" + "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End + "fmla v29.4s, v0.4s, v11.4s\n" + "add x19, x14, x4\n" + "tbz %x[n_channels], #1, 49f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 50f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 50f\n" + "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End + "fmla v28.4s, v0.4s, v12.4s\n" + "ldr s1, [x3, #0x58]\n" + "add x19, x14, x13\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "fmla v30.4s, v1.4s, v5.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "tbz %x[n_channels], #1, 51f\n" + "ldr d9, [x19], #0x8\n" + "tbz %x[n_channels], #0, 52f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 52f\n" + "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End + "fmla v28.4s, v1.4s, v9.4s\n" + "ldr s2, [x3, #0x5c]\n" + "add x19, x14, x12\n" + "fmla v31.4s, v2.4s, v5.4s\n" + "fmla v30.4s, v2.4s, v6.4s\n" + "fmla v29.4s, v2.4s, v9.4s\n" + "tbz %x[n_channels], #1, 53f\n" + "ldr d11, [x19], #0x8\n" + "tbz %x[n_channels], #0, 54f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 54f\n" + "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset + "ldr s11, [x19, #0x0]\n" + "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr s3, [x3, #0x60]\n" + "add x19, x14, x11\n" + "fmla v31.4s, v3.4s, v6.4s\n" + "fmla v30.4s, v3.4s, v8.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 55f\n" + "ldr d12, [x19], #0x8\n" + "tbz %x[n_channels], #0, 56f\n" + "ld1 { v12.s }[2], [x19]\n" + "b 56f\n" + "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset + "ldr s12, [x19, #0x0]\n" + "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr s4, [x3, #0x64]\n" + "add x19, x14, x10\n" + "fmla v31.4s, v4.4s, v8.4s\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "tbz %x[n_channels], #1, 57f\n" + "ldr d9, [x19], #0x8\n" + "tbz 
%x[n_channels], #0, 58f\n" + "ld1 { v9.s }[2], [x19]\n" + "b 58f\n" + "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset + "ldr s9, [x19, #0x0]\n" + "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End + "fmla v28.4s, v4.4s, v9.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "tbz %x[n_channels], #1, 59f\n" + "mov x19, x7\n" + "st1 { v31.d }[0], [x19], x6\n" + "add x7, x7, #0x8\n" + "st1 { v30.d }[0], [x19]\n" + "mov x19, x9\n" + "st1 { v29.d }[0], [x19], x6\n" + "add x9, x9, #0x8\n" + "st1 { v28.d }[0], [x19]\n" + "tbz %x[n_channels], #0, 60f\n" + "mov x20, x7\n" + "st1 { v31.s }[2], [x20], x6\n" + "mov x19, x9\n" + "st1 { v30.s }[2], [x20]\n" + "st1 { v29.s }[2], [x19], x6\n" + "st1 { v28.s }[2], [x19]\n" + "b 60f\n" + "59:" // Tile loop: Oddments: Store: Bit 1: Unset + "mov x20, x7\n" + "st1 { v31.s }[0], [x20], x6\n" + "mov x19, x9\n" + "st1 { v30.s }[0], [x20]\n" + "st1 { v29.s }[0], [x19], x6\n" + "st1 { v28.s }[0], [x19]\n" + "60:" // Tile loop: Oddments: Store: Bit 1: End + + "61:" // Tile loop: End + "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x28, #0x1\n" + "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "add x27, x27, #0x1\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x27, x19\n" + "csel x27, x27, XZR, LT\n" + "csel x28, x28, x21, LT\n" + "cmp x28, x20\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..de66a8c485 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[36]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[0]; + inptrs[1] = input_ptrs[1]; + inptrs[2] = input_ptrs[6]; + inptrs[3] = input_ptrs[7]; + inptrs[4] = input_ptrs[2]; + inptrs[5] = input_ptrs[8]; + inptrs[6] = input_ptrs[3]; + inptrs[7] = input_ptrs[4]; + inptrs[8] = input_ptrs[11]; + inptrs[9] = input_ptrs[12]; + inptrs[10] = input_ptrs[9]; + inptrs[11] = input_ptrs[10]; + inptrs[12] = input_ptrs[5]; + inptrs[13] = input_ptrs[13]; + inptrs[14] = input_ptrs[14]; + inptrs[15] = input_ptrs[15]; + inptrs[16] = input_ptrs[16]; + inptrs[17] = input_ptrs[17]; + inptrs[18] = input_ptrs[18]; + inptrs[19] = input_ptrs[19]; + inptrs[20] = input_ptrs[20]; + inptrs[21] = input_ptrs[21]; + inptrs[22] = input_ptrs[22]; + inptrs[23] = input_ptrs[23]; + inptrs[24] = input_ptrs[24]; + inptrs[25] = input_ptrs[25]; + inptrs[26] = input_ptrs[26]; + inptrs[27] = input_ptrs[27]; + inptrs[28] = input_ptrs[28]; + inptrs[29] = input_ptrs[29]; + inptrs[30] = input_ptrs[30]; + inptrs[31] = input_ptrs[31]; + inptrs[32] = input_ptrs[32]; + inptrs[33] = input_ptrs[33]; + inptrs[34] = input_ptrs[34]; + inptrs[35] = input_ptrs[35]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x20, %x[params_struct], %[offsetof_args_min]\n" + "add x19, %x[params_struct], %[offsetof_args_max]\n" + "ld1r { v18.4s }, [x20]\n" + "ld1r { v17.4s }, [x19]\n" + "mov x14, #0x0\n" + "ldp x13, x12, [x21, #0x0]\n" + "mov x11, #0x10\n" // cntb _, ALL, #1 + "ldp x10, x9, [x21, #0x10]\n" + "sub x28, XZR, x11\n" + "lsr x27, %x[n_channels], #0x2\n" + "cbz x27, 3f\n" +
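// x27 = n_channels / 4; when there is not even one full four-channel vector of work, the cbz above skips straight to the "Oddments" tail at label 3. +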
"ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "cmp x11, x27, LSL #4\n" + "ldr q1, [x15, #0x20]\n" + "ldr q2, [x15, #0x30]\n" + "ldr q3, [x15, #0x40]\n" + "ldr q4, [x15, #0x50]\n" + "add x15, x15, #0x60\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldr q5, [x26, x14]\n" + "ldr q6, [x25, x14]\n" + "ldr q7, [x24, x14]\n" + "ldr q8, [x23, x14]\n" + "ldr q9, [x22, x14]\n" + "ldr q13, [x21, x14]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ldp x26, x25, [x16, #0x40]\n" + "ldr q11, [x20, x14]\n" + "ldr q12, [x19, x14]\n" + "ldr q10, [x26, x14]\n" + "ldr q14, [x25, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n" + "ldr x24, [x16, #0x50]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n" + "ldr x23, [x16, #0x58]\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n" + "ldr x22, [x16, #0x60]\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n" + "ldr q5, [x24, x14]\n" + "ldr q0, [x15, #0x0]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x23, x14]\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "fmla v29.4s, v1.4s, v8.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v28.4s, v1.4s, v13.4s\n" + "ldr q1, [x15, #0x10]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "ldr q9, [x22, x14]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v13.4s\n" + "ldr x19, [x16, #0x78]\n" + "fmla v28.4s, v2.4s, v5.4s\n" + "ldr q2, [x15, #0x20]\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x21, x14]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v5.4s\n" + "ldr x25, [x16, #0x88]\n" + "fmla v28.4s, v3.4s, v6.4s\n" + "ldr q3, [x15, #0x30]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x20, x14]\n" + "fmla v30.4s, v4.4s, v9.4s\n" + "fmla v29.4s, v4.4s, v6.4s\n" + "ldr q9, [x19, x14]\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x15, #0x40]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.4s, v0.4s, v7.4s\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v30.4s, v0.4s, v8.4s\n" + "fmla v29.4s, v0.4s, v14.4s\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q0, [x15, #0x50]\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v31.4s, v1.4s, v8.4s\n" + "ldr q8, [x25, x14]\n" + "fmla v30.4s, v1.4s, v13.4s\n" + "fmla v29.4s, v1.4s, v11.4s\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q1, [x15, #0x60]\n" + "ldr x25, [x16, #0xc8]\n" + "fmla v31.4s, v2.4s, v13.4s\n" + "ldr q13, [x26, x14]\n" + "fmla v30.4s, v2.4s, v5.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v28.4s, v2.4s, v9.4s\n" + "ldr q2, [x15, #0x70]\n" + "ldr q16, [x15, #0x140]\n" + "fmla v31.4s, v3.4s, v5.4s\n" + "ldr q5, [x24, x14]\n" + "fmla v30.4s, v3.4s, v6.4s\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "ldr x24, [x16, #0xd0]\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr q3, [x15, #0x80]\n" + "fmla v31.4s, v4.4s, v6.4s\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0xd8]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "ldr q10, [x22, x14]\n" + "fmla v28.4s, v4.4s, v8.4s\n" + "ldr q4, [x15, #0x90]\n" + "ldr x22, [x16, #0xe0]\n" + "fmla v31.4s, v0.4s, v14.4s\n" + "ldr q14, [x19, x14]\n" + "fmla v30.4s, v0.4s, v11.4s\n" + "fmla v29.4s, v0.4s, v5.4s\n" + "ldr x19, [x16, #0xf8]\n" + "fmla v28.4s, v0.4s, v6.4s\n" + "ldr q0, [x15, #0xa0]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x21, x14]\n" + "ldr x21, [x16, #0xe8]\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v6.4s\n" 
+ "fmla v28.4s, v1.4s, v10.4s\n" + "ldr q1, [x15, #0xb0]\n" + "fmla v31.4s, v2.4s, v12.4s\n" + "ldr q12, [x20, x14]\n" + "ldr x20, [x16, #0xf0]\n" + "fmla v30.4s, v2.4s, v9.4s\n" + "fmla v29.4s, v2.4s, v10.4s\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr q2, [x15, #0xc0]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "ldr q9, [x26, x14]\n" + "ldr x26, [x16, #0x100]\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr q3, [x15, #0xd0]\n" + "fmla v31.4s, v4.4s, v13.4s\n" + "ldr q13, [x25, x14]\n" + "ldr x25, [x16, #0x108]\n" + "fmla v30.4s, v4.4s, v8.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "ldr q8, [x22, x14]\n" + "fmla v28.4s, v4.4s, v14.4s\n" + "ldr q4, [x15, #0xe0]\n" + "fmla v31.4s, v0.4s, v5.4s\n" + "ldr q5, [x24, x14]\n" + "ldr x24, [x16, #0x110]\n" + "fmla v30.4s, v0.4s, v6.4s\n" + "fmla v29.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v0.4s, v13.4s\n" + "ldr q0, [x15, #0xf0]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0x118]\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v1.4s, v13.4s\n" + "fmla v28.4s, v1.4s, v5.4s\n" + "ldr q1, [x15, #0x100]\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr q10, [x21, x14]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v5.4s\n" + "fmla v28.4s, v2.4s, v6.4s\n" + "ldr q2, [x15, #0x110]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x20, x14]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v6.4s\n" + "fmla v28.4s, v3.4s, v8.4s\n" + "ldr q3, [x15, #0x120]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x19, x14]\n" + "fmla v30.4s, v4.4s, v14.4s\n" + "fmla v29.4s, v4.4s, v8.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x15, #0x130]\n" + "fmla v31.4s, v0.4s, v9.4s\n" + "ldr q9, [x26, x14]\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x25, x14]\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "ldp x26, x25, [x16, #0x0]\n" + "ldr q0, [x15, #0x150]\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "fmla v30.4s, v1.4s, v5.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "ldr q12, [x24, x14]\n" + "fmla v28.4s, v1.4s, v9.4s\n" + "ldr q1, [x15, #0x160]\n" + "fmla v31.4s, v2.4s, v5.4s\n" + "ldr q5, [x26, x11]\n" + "fmla v30.4s, v2.4s, v6.4s\n" + "fmla v29.4s, v2.4s, v9.4s\n" + "ldr q9, [x23, x14]\n" + "add x14, x14, #0x10\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "ldp x24, x23, [x16, #0x10]\n" + "ldp x22, x21, [x16, #0x20]\n" + "fmla v31.4s, v3.4s, v6.4s\n" + "ldr q6, [x25, x11]\n" + "fmla v30.4s, v3.4s, v8.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "ldr q7, [x24, x11]\n" + "ldr q13, [x21, x11]\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldp x20, x19, [x16, #0x30]\n" + "fmla v31.4s, v4.4s, v8.4s\n" + "ldr q8, [x23, x11]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "ldr q11, [x20, x11]\n" + "ldr q12, [x19, x11]\n" + "fmla v28.4s, v4.4s, v9.4s\n" + "ldr q9, [x22, x11]\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "ldp x26, x25, [x16, #0x40]\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "ldr q2, [x15, #0x170]\n" + "ldr q3, [x15, #0x180]\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "ldr q10, [x26, x11]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "ldr q14, [x25, x11]\n" + "add x11, x11, #0x10\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "str q31, [x13, x28]\n" + "cmp x11, x27, LSL #4\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "str q30, [x12, x28]\n" + "ldr q4, [x15, #0x190]\n" + "add x15, x15, #0x1a0\n" + "str q29, [x10, x28]\n" + "str q28, [x9, x28]\n" + "blt 1b\n" + "2:" // Channel tail + "mov v31.16b, v16.16b\n 
fmla v31.4s, v0.4s, v5.4s\n" + "ldr x24, [x16, #0x50]\n" + "add x28, x28, #0x10\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n" + "ldr x23, [x16, #0x58]\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n" + "ldr x22, [x16, #0x60]\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n" + "ldr q5, [x24, x14]\n" + "ldr q0, [x15, #0x0]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x23, x14]\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "fmla v29.4s, v1.4s, v8.4s\n" + "ldr x21, [x16, #0x68]\n" + "fmla v28.4s, v1.4s, v13.4s\n" + "ldr q1, [x15, #0x10]\n" + "ldr x20, [x16, #0x70]\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "ldr q9, [x22, x14]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v13.4s\n" + "ldr x19, [x16, #0x78]\n" + "fmla v28.4s, v2.4s, v5.4s\n" + "ldr q2, [x15, #0x20]\n" + "ldr x26, [x16, #0x80]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x21, x14]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v5.4s\n" + "ldr x25, [x16, #0x88]\n" + "fmla v28.4s, v3.4s, v6.4s\n" + "ldr q3, [x15, #0x30]\n" + "ldr x24, [x16, #0x90]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x20, x14]\n" + "fmla v30.4s, v4.4s, v9.4s\n" + "fmla v29.4s, v4.4s, v6.4s\n" + "ldr q9, [x19, x14]\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x15, #0x40]\n" + "ldr x23, [x16, #0x98]\n" + "fmla v31.4s, v0.4s, v7.4s\n" + "ldr x22, [x16, #0xa0]\n" + "fmla v30.4s, v0.4s, v8.4s\n" + "fmla v29.4s, v0.4s, v14.4s\n" + "ldr x21, [x16, #0xa8]\n" + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr q0, [x15, #0x50]\n" + "ldr x20, [x16, #0xb0]\n" + "fmla v31.4s, v1.4s, v8.4s\n" + "ldr q8, [x25, x14]\n" + "fmla v30.4s, v1.4s, v13.4s\n" + "fmla v29.4s, v1.4s, v11.4s\n" + "ldr x19, [x16, #0xb8]\n" + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr q1, [x15, #0x60]\n" + "ldr x25, [x16, #0xc8]\n" + "fmla v31.4s, v2.4s, v13.4s\n" + "ldr q13, [x26, x14]\n" + "fmla v30.4s, v2.4s, v5.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "ldr x26, [x16, #0xc0]\n" + "fmla v28.4s, v2.4s, v9.4s\n" + "ldr q2, [x15, #0x70]\n" + "fmla v31.4s, v3.4s, v5.4s\n" + "ldr q5, [x24, x14]\n" + "ldr x24, [x16, #0xd0]\n" + "fmla v30.4s, v3.4s, v6.4s\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr q3, [x15, #0x80]\n" + "fmla v31.4s, v4.4s, v6.4s\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0xd8]\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "ldr q10, [x22, x14]\n" + "fmla v28.4s, v4.4s, v8.4s\n" + "ldr q4, [x15, #0x90]\n" + "ldr x22, [x16, #0xe0]\n" + "fmla v31.4s, v0.4s, v14.4s\n" + "ldr q14, [x19, x14]\n" + "fmla v30.4s, v0.4s, v11.4s\n" + "fmla v29.4s, v0.4s, v5.4s\n" + "ldr x19, [x16, #0xf8]\n" + "fmla v28.4s, v0.4s, v6.4s\n" + "ldr q0, [x15, #0xa0]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr q11, [x21, x14]\n" + "ldr x21, [x16, #0xe8]\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v6.4s\n" + "fmla v28.4s, v1.4s, v10.4s\n" + "ldr q1, [x15, #0xb0]\n" + "fmla v31.4s, v2.4s, v12.4s\n" + "ldr q12, [x20, x14]\n" + "ldr x20, [x16, #0xf0]\n" + "fmla v30.4s, v2.4s, v9.4s\n" + "fmla v29.4s, v2.4s, v10.4s\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr q2, [x15, #0xc0]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "ldr q9, [x26, x14]\n" + "ldr x26, [x16, #0x100]\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr q3, [x15, #0xd0]\n" + "fmla v31.4s, v4.4s, v13.4s\n" + "ldr q13, [x25, x14]\n" + "ldr x25, [x16, #0x108]\n" + "fmla v30.4s, v4.4s, v8.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "ldr q8, [x22, x14]\n" + "fmla v28.4s, v4.4s, v14.4s\n" + "ldr q4, [x15, #0xe0]\n" 
+ "fmla v31.4s, v0.4s, v5.4s\n" + "ldr q5, [x24, x14]\n" + "ldr x24, [x16, #0x110]\n" + "fmla v30.4s, v0.4s, v6.4s\n" + "fmla v29.4s, v0.4s, v9.4s\n" + "fmla v28.4s, v0.4s, v13.4s\n" + "ldr q0, [x15, #0xf0]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr q6, [x23, x14]\n" + "ldr x23, [x16, #0x118]\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v1.4s, v13.4s\n" + "fmla v28.4s, v1.4s, v5.4s\n" + "ldr q1, [x15, #0x100]\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr q10, [x21, x14]\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v5.4s\n" + "fmla v28.4s, v2.4s, v6.4s\n" + "ldr q2, [x15, #0x110]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr q11, [x20, x14]\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v6.4s\n" + "fmla v28.4s, v3.4s, v8.4s\n" + "ldr q3, [x15, #0x120]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr q12, [x19, x14]\n" + "fmla v30.4s, v4.4s, v14.4s\n" + "fmla v29.4s, v4.4s, v8.4s\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr q4, [x15, #0x130]\n" + "add x15, x15, #0x140\n" + "fmla v31.4s, v0.4s, v9.4s\n" + "ldr q9, [x26, x14]\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr q11, [x25, x14]\n" + "fmla v28.4s, v0.4s, v12.4s\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "fmla v30.4s, v1.4s, v5.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "ldr q12, [x24, x14]\n" + "fmla v28.4s, v1.4s, v9.4s\n" + "fmla v31.4s, v2.4s, v5.4s\n" + "fmla v30.4s, v2.4s, v6.4s\n" + "fmla v29.4s, v2.4s, v9.4s\n" + "ldr q9, [x23, x14]\n" + "add x14, x14, #0x10\n" + "fmla v28.4s, v2.4s, v11.4s\n" + "fmla v31.4s, v3.4s, v6.4s\n" + "fmla v30.4s, v3.4s, v8.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "fmla v28.4s, v3.4s, v12.4s\n" + "fmla v31.4s, v4.4s, v8.4s\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "fmla v28.4s, v4.4s, v9.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "str q31, [x13, x28]\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "str q30, [x12, x28]\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "str q29, [x10, x28]\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "str q28, [x9, x28]\n" + "3:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 60f\n" + "ldr q16, [x15, #0x0]\n" + "ldr q0, [x15, #0x10]\n" + "mov x28, x14\n" + "ldr q1, [x15, #0x20]\n" + "add x13, x13, x28\n" + "ldr q2, [x15, #0x30]\n" + "add x12, x12, x28\n" + "ldr q3, [x15, #0x40]\n" + "add x10, x10, x28\n" + "ldr q4, [x15, #0x50]\n" + "add x9, x9, x28\n" + "ldr x24, [x16, #0x10]\n" + "ldr x23, [x16, #0x18]\n" + "ldr x22, [x16, #0x20]\n" + "add x24, x24, x14\n" + "ldr x21, [x16, #0x28]\n" + "add x23, x23, x14\n" + "ldr x20, [x16, #0x30]\n" + "add x22, x22, x14\n" + "ldr x19, [x16, #0x38]\n" + "add x21, x21, x14\n" + "ldr x26, [x16, #0x40]\n" + "add x20, x20, x14\n" + "ldr x25, [x16, #0x48]\n" + "add x19, x19, x14\n" + "add x26, x26, x14\n" + "add x25, x25, x14\n" + "add x15, x15, #0x60\n" + "tbz %x[n_channels], #1, 4f\n" + "ld1 { v5.d }[0], [x26], #0x8\n" + "ld1 { v6.d }[0], [x25], #0x8\n" + "ld1 { v7.d }[0], [x24], #0x8\n" + "ld1 { v8.d }[0], [x23], #0x8\n" + "ld1 { v9.d }[0], [x22], #0x8\n" + "ld1 { v13.d }[0], [x21], #0x8\n" + "ld1 { v11.d }[0], [x20], #0x8\n" + "ld1 { v12.d }[0], [x19], #0x8\n" + "ld1 { v10.d }[0], [x26], #0x8\n" + "ld1 { v14.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 5f\n" + "ld1 { v7.s }[2], [x24], #0x4\n" + "ld1 { v8.s }[2], [x23], #0x4\n" + "ld1 { v5.s }[2], [x26], #0x4\n" + "ld1 { v6.s }[2], [x25], #0x4\n" + "ld1 { v9.s }[2], [x22], #0x4\n" + "ld1 { v13.s }[2], 
[x21], #0x4\n" + "ld1 { v11.s }[2], [x20], #0x4\n" + "ld1 { v12.s }[2], [x19], #0x4\n" + "ld1 { v10.s }[2], [x26], #0x4\n" + "ld1 { v14.s }[2], [x25], #0x4\n" + "b 5f\n" + "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset + "ld1 { v5.s }[0], [x26], #0x4\n" + "ld1 { v6.s }[0], [x25], #0x4\n" + "ld1 { v7.s }[0], [x24], #0x4\n" + "ld1 { v8.s }[0], [x23], #0x4\n" + "ld1 { v9.s }[0], [x22], #0x4\n" + "ld1 { v13.s }[0], [x21], #0x4\n" + "ld1 { v11.s }[0], [x20], #0x4\n" + "ld1 { v12.s }[0], [x19], #0x4\n" + "ld1 { v10.s }[0], [x26], #0x4\n" + "ld1 { v14.s }[0], [x25], #0x4\n" + "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End + "mov v31.16b, v16.16b\n fmla v31.4s, v0.4s, v5.4s\n" + "ldr x24, [x16, #0x50]\n" + "mov v30.16b, v16.16b\n fmla v30.4s, v0.4s, v6.4s\n" + "add x24, x24, x14\n" + "mov v29.16b, v16.16b\n fmla v29.4s, v0.4s, v7.4s\n" + "mov v28.16b, v16.16b\n fmla v28.4s, v0.4s, v8.4s\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "fmla v30.4s, v1.4s, v9.4s\n" + "fmla v29.4s, v1.4s, v8.4s\n" + "fmla v28.4s, v1.4s, v13.4s\n" + "fmla v31.4s, v2.4s, v9.4s\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v13.4s\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v5.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v5.s }[2], [x24], #0x4\n" + "b 7f\n" + "6:" // Oddments: Load input (1, 3): Bit 1: Unset + "ld1 { v5.s }[0], [x24], #0x4\n" + "7:" // Oddments: Load input (1, 3): Bit 1: End + "fmla v28.4s, v2.4s, v5.4s\n" + "ldr x23, [x16, #0x58]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "add x23, x23, x14\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v5.4s\n" + "tbz %x[n_channels], #1, 8f\n" + "ld1 { v6.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 9f\n" + "ld1 { v6.s }[2], [x23], #0x4\n" + "b 9f\n" + "8:" // Oddments: Load input (1, 4): Bit 1: Unset + "ld1 { v6.s }[0], [x23], #0x4\n" + "9:" // Oddments: Load input (1, 4): Bit 1: End + "fmla v28.4s, v3.4s, v6.4s\n" + "ldr x22, [x16, #0x60]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "add x22, x22, x14\n" + "tbz %x[n_channels], #1, 10f\n" + "ld1 { v9.d }[0], [x22], #0x8\n" + "tbz %x[n_channels], #0, 11f\n" + "ld1 { v9.s }[2], [x22], #0x4\n" + "b 11f\n" + "10:" // Oddments: Load input (0, 5): Bit 1: Unset + "ld1 { v9.s }[0], [x22], #0x4\n" + "11:" // Oddments: Load input (0, 5): Bit 1: End + "fmla v30.4s, v4.4s, v9.4s\n" + "ldr s0, [x15, #0x18]\n" + "fmla v29.4s, v4.4s, v6.4s\n" + "ldr x21, [x16, #0x68]\n" + "add x21, x21, x14\n" + "fmla v28.4s, v4.4s, v10.4s\n" + "fmla v31.4s, v0.4s, v7.4s\n" + "fmla v30.4s, v0.4s, v8.4s\n" + "fmla v29.4s, v0.4s, v14.4s\n" + "tbz %x[n_channels], #1, 12f\n" + "ld1 { v11.d }[0], [x21], #0x8\n" + "tbz %x[n_channels], #0, 13f\n" + "ld1 { v11.s }[2], [x21], #0x4\n" + "b 13f\n" + "12:" // Oddments: Load input (2, 1): Bit 1: Unset + "ld1 { v11.s }[0], [x21], #0x4\n" + "13:" // Oddments: Load input (2, 1): Bit 1: End + "fmla v28.4s, v0.4s, v11.4s\n" + "ldr s1, [x15, #0x1c]\n" + "fmla v31.4s, v1.4s, v8.4s\n" + "ldr x20, [x16, #0x70]\n" + "add x20, x20, x14\n" + "fmla v30.4s, v1.4s, v13.4s\n" + "fmla v29.4s, v1.4s, v11.4s\n" + "tbz %x[n_channels], #1, 14f\n" + "ld1 { v12.d }[0], [x20], #0x8\n" + "tbz %x[n_channels], #0, 15f\n" + "ld1 { v12.s }[2], [x20], #0x4\n" + "b 15f\n" + "14:" // Oddments: Load input (2, 2): Bit 1: Unset + "ld1 { v12.s }[0], [x20], #0x4\n" + "15:" // Oddments: Load input (2, 2): Bit 1: End + "fmla v28.4s, v1.4s, v12.4s\n" + "ldr s2, [x15, #0x20]\n" 
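+ // Oddments path: each weight is reloaded as a single lane (ldr s...) and inputs are fetched with tbz-guarded partial loads, so only the valid leftover channels are read.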
+ "fmla v31.4s, v2.4s, v13.4s\n" + "ldr x19, [x16, #0x78]\n" + "add x19, x19, x14\n" + "fmla v30.4s, v2.4s, v5.4s\n" + "fmla v29.4s, v2.4s, v12.4s\n" + "tbz %x[n_channels], #1, 16f\n" + "ld1 { v9.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 17f\n" + "ld1 { v9.s }[2], [x19], #0x4\n" + "b 17f\n" + "16:" // Oddments: Load input (2, 3): Bit 1: Unset + "ld1 { v9.s }[0], [x19], #0x4\n" + "17:" // Oddments: Load input (2, 3): Bit 1: End + "fmla v28.4s, v2.4s, v9.4s\n" + "ldr s3, [x15, #0x24]\n" + "fmla v31.4s, v3.4s, v5.4s\n" + "ldr x26, [x16, #0x80]\n" + "add x26, x26, x14\n" + "fmla v30.4s, v3.4s, v6.4s\n" + "fmla v29.4s, v3.4s, v9.4s\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v13.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 19f\n" + "ld1 { v13.s }[2], [x26], #0x4\n" + "b 19f\n" + "18:" // Oddments: Load input (2, 4): Bit 1: Unset + "ld1 { v13.s }[0], [x26], #0x4\n" + "19:" // Oddments: Load input (2, 4): Bit 1: End + "fmla v28.4s, v3.4s, v13.4s\n" + "ldr s4, [x15, #0x28]\n" + "fmla v31.4s, v4.4s, v6.4s\n" + "ldr x25, [x16, #0x88]\n" + "add x25, x25, x14\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v13.4s\n" + "tbz %x[n_channels], #1, 20f\n" + "ld1 { v8.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 21f\n" + "ld1 { v8.s }[2], [x25], #0x4\n" + "b 21f\n" + "20:" // Oddments: Load input (2, 5): Bit 1: Unset + "ld1 { v8.s }[0], [x25], #0x4\n" + "21:" // Oddments: Load input (2, 5): Bit 1: End + "fmla v28.4s, v4.4s, v8.4s\n" + "ldr s0, [x15, #0x2c]\n" + "fmla v31.4s, v0.4s, v14.4s\n" + "ldr x24, [x16, #0x90]\n" + "add x24, x24, x14\n" + "fmla v30.4s, v0.4s, v11.4s\n" + "tbz %x[n_channels], #1, 22f\n" + "ld1 { v5.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 23f\n" + "ld1 { v5.s }[2], [x24], #0x4\n" + "b 23f\n" + "22:" // Oddments: Load input (3, 0): Bit 1: Unset + "ld1 { v5.s }[0], [x24], #0x4\n" + "23:" // Oddments: Load input (3, 0): Bit 1: End + "fmla v29.4s, v0.4s, v5.4s\n" + "ldr x23, [x16, #0x98]\n" + "add x23, x23, x14\n" + "tbz %x[n_channels], #1, 24f\n" + "ld1 { v6.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 25f\n" + "ld1 { v6.s }[2], [x23], #0x4\n" + "b 25f\n" + "24:" // Oddments: Load input (3, 1): Bit 1: Unset + "ld1 { v6.s }[0], [x23], #0x4\n" + "25:" // Oddments: Load input (3, 1): Bit 1: End + "fmla v28.4s, v0.4s, v6.4s\n" + "ldr s1, [x15, #0x30]\n" + "fmla v31.4s, v1.4s, v11.4s\n" + "ldr x22, [x16, #0xa0]\n" + "add x22, x22, x14\n" + "fmla v30.4s, v1.4s, v12.4s\n" + "fmla v29.4s, v1.4s, v6.4s\n" + "tbz %x[n_channels], #1, 26f\n" + "ld1 { v10.d }[0], [x22], #0x8\n" + "tbz %x[n_channels], #0, 27f\n" + "ld1 { v10.s }[2], [x22], #0x4\n" + "b 27f\n" + "26:" // Oddments: Load input (3, 2): Bit 1: Unset + "ld1 { v10.s }[0], [x22], #0x4\n" + "27:" // Oddments: Load input (3, 2): Bit 1: End + "fmla v28.4s, v1.4s, v10.4s\n" + "ldr s2, [x15, #0x34]\n" + "fmla v31.4s, v2.4s, v12.4s\n" + "ldr x21, [x16, #0xa8]\n" + "add x21, x21, x14\n" + "fmla v30.4s, v2.4s, v9.4s\n" + "fmla v29.4s, v2.4s, v10.4s\n" + "tbz %x[n_channels], #1, 28f\n" + "ld1 { v11.d }[0], [x21], #0x8\n" + "tbz %x[n_channels], #0, 29f\n" + "ld1 { v11.s }[2], [x21], #0x4\n" + "b 29f\n" + "28:" // Oddments: Load input (3, 3): Bit 1: Unset + "ld1 { v11.s }[0], [x21], #0x4\n" + "29:" // Oddments: Load input (3, 3): Bit 1: End + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr s3, [x15, #0x38]\n" + "fmla v31.4s, v3.4s, v9.4s\n" + "ldr x20, [x16, #0xb0]\n" + "add x20, x20, x14\n" + "fmla v30.4s, v3.4s, v13.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 30f\n" + "ld1 { v12.d }[0], [x20], 
#0x8\n" + "tbz %x[n_channels], #0, 31f\n" + "ld1 { v12.s }[2], [x20], #0x4\n" + "b 31f\n" + "30:" // Oddments: Load input (3, 4): Bit 1: Unset + "ld1 { v12.s }[0], [x20], #0x4\n" + "31:" // Oddments: Load input (3, 4): Bit 1: End + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr s4, [x15, #0x3c]\n" + "fmla v31.4s, v4.4s, v13.4s\n" + "ldr x19, [x16, #0xb8]\n" + "add x19, x19, x14\n" + "fmla v30.4s, v4.4s, v8.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "tbz %x[n_channels], #1, 32f\n" + "ld1 { v14.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 33f\n" + "ld1 { v14.s }[2], [x19], #0x4\n" + "b 33f\n" + "32:" // Oddments: Load input (3, 5): Bit 1: Unset + "ld1 { v14.s }[0], [x19], #0x4\n" + "33:" // Oddments: Load input (3, 5): Bit 1: End + "fmla v28.4s, v4.4s, v14.4s\n" + "ldr s0, [x15, #0x40]\n" + "fmla v31.4s, v0.4s, v5.4s\n" + "ldr x26, [x16, #0xc0]\n" + "add x26, x26, x14\n" + "fmla v30.4s, v0.4s, v6.4s\n" + "tbz %x[n_channels], #1, 34f\n" + "ld1 { v9.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 35f\n" + "ld1 { v9.s }[2], [x26], #0x4\n" + "b 35f\n" + "34:" // Oddments: Load input (4, 0): Bit 1: Unset + "ld1 { v9.s }[0], [x26], #0x4\n" + "35:" // Oddments: Load input (4, 0): Bit 1: End + "fmla v29.4s, v0.4s, v9.4s\n" + "ldr x25, [x16, #0xc8]\n" + "add x25, x25, x14\n" + "tbz %x[n_channels], #1, 36f\n" + "ld1 { v13.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 37f\n" + "ld1 { v13.s }[2], [x25], #0x4\n" + "b 37f\n" + "36:" // Oddments: Load input (4, 1): Bit 1: Unset + "ld1 { v13.s }[0], [x25], #0x4\n" + "37:" // Oddments: Load input (4, 1): Bit 1: End + "fmla v28.4s, v0.4s, v13.4s\n" + "ldr s1, [x15, #0x44]\n" + "fmla v31.4s, v1.4s, v6.4s\n" + "ldr x24, [x16, #0xd0]\n" + "add x24, x24, x14\n" + "fmla v30.4s, v1.4s, v10.4s\n" + "fmla v29.4s, v1.4s, v13.4s\n" + "tbz %x[n_channels], #1, 38f\n" + "ld1 { v5.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 39f\n" + "ld1 { v5.s }[2], [x24], #0x4\n" + "b 39f\n" + "38:" // Oddments: Load input (4, 2): Bit 1: Unset + "ld1 { v5.s }[0], [x24], #0x4\n" + "39:" // Oddments: Load input (4, 2): Bit 1: End + "fmla v28.4s, v1.4s, v5.4s\n" + "ldr s2, [x15, #0x48]\n" + "fmla v31.4s, v2.4s, v10.4s\n" + "ldr x23, [x16, #0xd8]\n" + "add x23, x23, x14\n" + "fmla v30.4s, v2.4s, v11.4s\n" + "fmla v29.4s, v2.4s, v5.4s\n" + "tbz %x[n_channels], #1, 40f\n" + "ld1 { v6.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 41f\n" + "ld1 { v6.s }[2], [x23], #0x4\n" + "b 41f\n" + "40:" // Oddments: Load input (4, 3): Bit 1: Unset + "ld1 { v6.s }[0], [x23], #0x4\n" + "41:" // Oddments: Load input (4, 3): Bit 1: End + "fmla v28.4s, v2.4s, v6.4s\n" + "ldr s3, [x15, #0x4c]\n" + "fmla v31.4s, v3.4s, v11.4s\n" + "ldr x22, [x16, #0xe0]\n" + "add x22, x22, x14\n" + "fmla v30.4s, v3.4s, v12.4s\n" + "fmla v29.4s, v3.4s, v6.4s\n" + "tbz %x[n_channels], #1, 42f\n" + "ld1 { v8.d }[0], [x22], #0x8\n" + "tbz %x[n_channels], #0, 43f\n" + "ld1 { v8.s }[2], [x22], #0x4\n" + "b 43f\n" + "42:" // Oddments: Load input (4, 4): Bit 1: Unset + "ld1 { v8.s }[0], [x22], #0x4\n" + "43:" // Oddments: Load input (4, 4): Bit 1: End + "fmla v28.4s, v3.4s, v8.4s\n" + "ldr s4, [x15, #0x50]\n" + "fmla v31.4s, v4.4s, v12.4s\n" + "ldr x21, [x16, #0xe8]\n" + "add x21, x21, x14\n" + "fmla v30.4s, v4.4s, v14.4s\n" + "fmla v29.4s, v4.4s, v8.4s\n" + "tbz %x[n_channels], #1, 44f\n" + "ld1 { v10.d }[0], [x21], #0x8\n" + "tbz %x[n_channels], #0, 45f\n" + "ld1 { v10.s }[2], [x21], #0x4\n" + "b 45f\n" + "44:" // Oddments: Load input (4, 5): Bit 1: Unset + "ld1 { v10.s }[0], [x21], #0x4\n" + "45:" // Oddments: Load input (4, 
5): Bit 1: End + "fmla v28.4s, v4.4s, v10.4s\n" + "ldr s0, [x15, #0x54]\n" + "fmla v31.4s, v0.4s, v9.4s\n" + "ldr x20, [x16, #0xf0]\n" + "add x20, x20, x14\n" + "fmla v30.4s, v0.4s, v13.4s\n" + "tbz %x[n_channels], #1, 46f\n" + "ld1 { v11.d }[0], [x20], #0x8\n" + "tbz %x[n_channels], #0, 47f\n" + "ld1 { v11.s }[2], [x20], #0x4\n" + "b 47f\n" + "46:" // Oddments: Load input (5, 0): Bit 1: Unset + "ld1 { v11.s }[0], [x20], #0x4\n" + "47:" // Oddments: Load input (5, 0): Bit 1: End + "fmla v29.4s, v0.4s, v11.4s\n" + "ldr x19, [x16, #0xf8]\n" + "add x19, x19, x14\n" + "tbz %x[n_channels], #1, 48f\n" + "ld1 { v12.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 49f\n" + "ld1 { v12.s }[2], [x19], #0x4\n" + "b 49f\n" + "48:" // Oddments: Load input (5, 1): Bit 1: Unset + "ld1 { v12.s }[0], [x19], #0x4\n" + "49:" // Oddments: Load input (5, 1): Bit 1: End + "fmla v28.4s, v0.4s, v12.4s\n" + "ldr s1, [x15, #0x58]\n" + "fmla v31.4s, v1.4s, v13.4s\n" + "ldr x26, [x16, #0x100]\n" + "add x26, x26, x14\n" + "fmla v30.4s, v1.4s, v5.4s\n" + "fmla v29.4s, v1.4s, v12.4s\n" + "tbz %x[n_channels], #1, 50f\n" + "ld1 { v9.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #0, 51f\n" + "ld1 { v9.s }[2], [x26], #0x4\n" + "b 51f\n" + "50:" // Oddments: Load input (5, 2): Bit 1: Unset + "ld1 { v9.s }[0], [x26], #0x4\n" + "51:" // Oddments: Load input (5, 2): Bit 1: End + "fmla v28.4s, v1.4s, v9.4s\n" + "ldr s2, [x15, #0x5c]\n" + "fmla v31.4s, v2.4s, v5.4s\n" + "ldr x25, [x16, #0x108]\n" + "add x25, x25, x14\n" + "fmla v30.4s, v2.4s, v6.4s\n" + "fmla v29.4s, v2.4s, v9.4s\n" + "tbz %x[n_channels], #1, 52f\n" + "ld1 { v11.d }[0], [x25], #0x8\n" + "tbz %x[n_channels], #0, 53f\n" + "ld1 { v11.s }[2], [x25], #0x4\n" + "b 53f\n" + "52:" // Oddments: Load input (5, 3): Bit 1: Unset + "ld1 { v11.s }[0], [x25], #0x4\n" + "53:" // Oddments: Load input (5, 3): Bit 1: End + "fmla v28.4s, v2.4s, v11.4s\n" + "ldr s3, [x15, #0x60]\n" + "fmla v31.4s, v3.4s, v6.4s\n" + "ldr x24, [x16, #0x110]\n" + "add x24, x24, x14\n" + "fmla v30.4s, v3.4s, v8.4s\n" + "fmla v29.4s, v3.4s, v11.4s\n" + "tbz %x[n_channels], #1, 54f\n" + "ld1 { v12.d }[0], [x24], #0x8\n" + "tbz %x[n_channels], #0, 55f\n" + "ld1 { v12.s }[2], [x24], #0x4\n" + "b 55f\n" + "54:" // Oddments: Load input (5, 4): Bit 1: Unset + "ld1 { v12.s }[0], [x24], #0x4\n" + "55:" // Oddments: Load input (5, 4): Bit 1: End + "fmla v28.4s, v3.4s, v12.4s\n" + "ldr s4, [x15, #0x64]\n" + "fmla v31.4s, v4.4s, v8.4s\n" + "ldr x23, [x16, #0x118]\n" + "add x23, x23, x14\n" + "fmla v30.4s, v4.4s, v10.4s\n" + "fmla v29.4s, v4.4s, v12.4s\n" + "tbz %x[n_channels], #1, 56f\n" + "ld1 { v9.d }[0], [x23], #0x8\n" + "tbz %x[n_channels], #0, 57f\n" + "ld1 { v9.s }[2], [x23], #0x4\n" + "b 57f\n" + "56:" // Oddments: Load input (5, 5): Bit 1: Unset + "ld1 { v9.s }[0], [x23], #0x4\n" + "57:" // Oddments: Load input (5, 5): Bit 1: End + "fmla v28.4s, v4.4s, v9.4s\n" + "fmax v31.4s, v31.4s, v18.4s\n" + "fmax v30.4s, v30.4s, v18.4s\n" + "fmax v29.4s, v29.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v17.4s\n" + "fmin v30.4s, v30.4s, v17.4s\n" + "fmin v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v18.4s\n" + "fmin v28.4s, v28.4s, v17.4s\n" + "tbz %x[n_channels], #1, 58f\n" + "st1 { v31.d }[0], [x13], #0x8\n" + "st1 { v30.d }[0], [x12], #0x8\n" + "st1 { v29.d }[0], [x10], #0x8\n" + "st1 { v28.d }[0], [x9], #0x8\n" + "tbz %x[n_channels], #0, 59f\n" + "st1 { v31.s }[2], [x13], #0x4\n" + "st1 { v30.s }[2], [x12], #0x4\n" + "st1 { v29.s }[2], [x10], #0x4\n" + "st1 { v28.s }[2], [x9], #0x4\n" + "b 59f\n" + "58:" // Oddments: 
Store: Bit 1: Unset + "st1 { v31.s }[0], [x13], #0x4\n" + "st1 { v30.s }[0], [x12], #0x4\n" + "st1 { v29.s }[0], [x10], #0x4\n" + "st1 { v28.s }[0], [x9], #0x4\n" + "59:" // Oddments: Store: Bit 1: End + + "60:" // End + + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp new file mode 100644 index 0000000000..0f6cecdc56 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float); + +struct a64_fp32_nhwc_generic_output9_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int n_output_points = 9; + + kern_type kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl; + + a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..e8e817e9cc --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const void *params, + const void *bias, + const unsigned int n_points, + const unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ld1r { v4.4s }, [%x[minmax_vals]]\n" + "add x19, %x[minmax_vals], #0x4\n" + "mov x11, #0x0\n" + "ld1r { v3.4s }, [x19]\n" + "lsr x10, %x[n_channels], #0x2\n" + "cbz x10, 5f\n" + "1:" // Channel loop + "movi v25.16b, #0x0\n" + "cbz %x[bias], 2f\n" + "ldr q25, [%x[bias], x11]\n" + "2:" // Channel loop: Load bias: Done + "mov v24.16b, v25.16b\n" + "ldr q23, [%x[params], #0x0]\n" + "mov x20, %x[inptrs]\n" + "mov v22.16b, v25.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, %x[n_points], #0x1\n" + "mov v21.16b, v25.16b\n" + "ldr q2, [x9, x11]\n" + "mov v20.16b, v25.16b\n" + "add %x[params], %x[params], #0x10\n" + "mov v19.16b, v25.16b\n" + "ldr q1, [x28, x11]\n" + "mov v18.16b, v25.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v17.16b, v25.16b\n" + "ldr q0, [x27, x11]\n" + "mov v16.16b, v25.16b\n" + "ldr q31, [x26, x11]\n" + "ldp x25, x24, [x20], #0x10\n" + "ldr q30, [x25, x11]\n" + "ldr q29, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "ldr q28, [x23, x11]\n" + "ldr q27, [x22, x11]\n" + "ldr x21, [x20], #0x8\n" + "ldr q26, [x21, x11]\n" + "ble 4f\n" + "3:" // Channel loop: Planar loop + "fmla v25.4s, v2.4s, v23.4s\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, x19, #0x1\n" + "fmla v24.4s, v1.4s, v23.4s\n" + "ldr q2, [x9, x11]\n" + "fmla v22.4s, v0.4s, v23.4s\n" + "fmla v21.4s, v31.4s, v23.4s\n" + "ldr q1, [x28, x11]\n" + "fmla v20.4s, v30.4s, v23.4s\n" + "ldp x27, x26, [x20], #0x10\n" + "fmla v19.4s, v29.4s, v23.4s\n" + "fmla v18.4s, v28.4s, v23.4s\n" + "ldr q0, [x27, x11]\n" + "fmla v17.4s, v27.4s, v23.4s\n" + "fmla v16.4s, v26.4s, v23.4s\n" + "ldr q23, [%x[params], #0x0]\n" + "add %x[params], %x[params], #0x10\n" + "ldr q31, [x26, x11]\n" + "ldp x25, x24, [x20], #0x10\n" + "ldr q30, [x25, x11]\n" + "ldr q29, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "ldr q28, [x23, x11]\n" + "ldr q27, [x22, x11]\n" + "ldr x21, [x20], #0x8\n" + "ldr q26, [x21, x11]\n" + "bgt 3b\n" + "4:" // Channel loop: Planar tail + "fmla v25.4s, v2.4s, v23.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "fmla v24.4s, v1.4s, v23.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "fmla v22.4s, v0.4s, v23.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "fmla v21.4s, v31.4s, v23.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "fmla v20.4s, v30.4s, v23.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmla v19.4s, v29.4s, v23.4s\n" + "fmla v18.4s, v28.4s, v23.4s\n" + "fmla v17.4s, v27.4s, v23.4s\n" + "fmla v16.4s, v26.4s, v23.4s\n" + "fmax v25.4s, v25.4s, v4.4s\n" + "fmax v24.4s, v24.4s, v4.4s\n" + "fmax v22.4s, v22.4s, v4.4s\n" + "fmin v25.4s, v25.4s, v3.4s\n" + "str q25, [x27, x11]\n" + "fmin v24.4s, v24.4s, v3.4s\n" + "fmin v22.4s, v22.4s, v3.4s\n" + "str q24, [x26, x11]\n" + "fmax v21.4s, v21.4s, v4.4s\n" + "fmax v20.4s, v20.4s, v4.4s\n" + "str q22, [x25, x11]\n" + "fmax v19.4s, v19.4s, v4.4s\n" + "fmax v18.4s, v18.4s, v4.4s\n" + "fmin v21.4s, v21.4s, v3.4s\n" + "str q21, [x24, x11]\n" + "fmin v20.4s, v20.4s, v3.4s\n" + "fmin v19.4s, v19.4s, v3.4s\n" + "str q20, [x23, x11]\n" + "fmin v18.4s, v18.4s, v3.4s\n" + "fmax v17.4s, v17.4s, v4.4s\n" + "str q19,
[x22, x11]\n" + "fmax v16.4s, v16.4s, v4.4s\n" + "str q18, [x21, x11]\n" + "fmin v17.4s, v17.4s, v3.4s\n" + "fmin v16.4s, v16.4s, v3.4s\n" + "str q17, [x20, x11]\n" + "str q16, [x19, x11]\n" + "add x11, x11, #0x10\n" + "cmp x11, x10, LSL #4\n" + "blt 1b\n" + "5:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 17f\n" + "movi v25.16b, #0x0\n" + "cbz %x[bias], 8f\n" + "add x19, %x[bias], x11\n" + "tbz %x[n_channels], #1, 6f\n" + "ld1 { v25.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v25.s }[2], [x19], #0x4\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 1: Unset + "tbz %x[n_channels], #0, 7f\n" + "ld1 { v25.s }[0], [x19], #0x4\n" + "7:" // Oddments: Load bias: Bit 1: End + + "8:" // Oddments: Load bias: Done + "mov v24.16b, v25.16b\n" + "ldr q23, [%x[params], #0x0]\n" + "mov x20, %x[inptrs]\n" + "mov v22.16b, v25.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "add %x[params], %x[params], #0x10\n" + "mov v21.16b, v25.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v20.16b, v25.16b\n" + "add x9, x9, x11\n" + "mov v19.16b, v25.16b\n" + "ldp x25, x24, [x20], #0x10\n" + "mov v18.16b, v25.16b\n" + "add x28, x28, x11\n" + "mov v17.16b, v25.16b\n" + "ldp x23, x22, [x20], #0x10\n" + "mov v16.16b, v25.16b\n" + "add x27, x27, x11\n" + "ldr x21, [x20], #0x8\n" + "add x26, x26, x11\n" + "add x25, x25, x11\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #1, 9f\n" + "ldr d2, [x9], #0x8\n" + "ldr d1, [x28], #0x8\n" + "ldr d0, [x27], #0x8\n" + "ldr d31, [x26], #0x8\n" + "ldr d30, [x25], #0x8\n" + "ldr d29, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v2.s }[2], [x9], #0x4\n" + "ld1 { v1.s }[2], [x28], #0x4\n" + "ld1 { v0.s }[2], [x27], #0x4\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v30.s }[2], [x25], #0x4\n" + "ld1 { v29.s }[2], [x24], #0x4\n" + "ld1 { v28.s }[2], [x23], #0x4\n" + "ld1 { v27.s }[2], [x22], #0x4\n" + "ld1 { v26.s }[2], [x21], #0x4\n" + "b 10f\n" + "9:" // Oddments: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ldr s2, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s0, [x27], #0x4\n" + "ldr s31, [x26], #0x4\n" + "ldr s30, [x25], #0x4\n" + "ldr s29, [x24], #0x4\n" + "ldr s28, [x23], #0x4\n" + "ldr s27, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "10:" // Oddments: Load: Bit 1: End + "subs x19, %x[n_points], #0x1\n" + "ble 14f\n" + "11:" // Oddments: Planar loop + "fmla v25.4s, v2.4s, v23.4s\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "fmla v24.4s, v1.4s, v23.4s\n" + "ldp x27, x26, [x20], #0x10\n" + "fmla v22.4s, v0.4s, v23.4s\n" + "ldp x25, x24, [x20], #0x10\n" + "fmla v21.4s, v31.4s, v23.4s\n" + "add x28, x28, x11\n" + "fmla v20.4s, v30.4s, v23.4s\n" + "ldp x23, x22, [x20], #0x10\n" + "fmla v19.4s, v29.4s, v23.4s\n" + "add x27, x27, x11\n" + "fmla v18.4s, v28.4s, v23.4s\n" + "ldr x21, [x20], #0x8\n" + "fmla v17.4s, v27.4s, v23.4s\n" + "add x26, x26, x11\n" + "fmla v16.4s, v26.4s, v23.4s\n" + "ldr q23, [%x[params], #0x0]\n" + "add x25, x25, x11\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "add %x[params], %x[params], #0x10\n" + "tbz %x[n_channels], #1, 12f\n" + "ldr d2, [x9], #0x8\n" + "ldr d1, [x28], #0x8\n" + "ldr d0, [x27], #0x8\n" + "ldr d31, [x26], #0x8\n" + "ldr d30, [x25], #0x8\n" + "ldr d29, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d27, [x22], #0x8\n" + "ldr d26, [x21], #0x8\n" + "tbz %x[n_channels], #0, 13f\n" 
+ "ld1 { v2.s }[2], [x9], #0x4\n" + "ld1 { v1.s }[2], [x28], #0x4\n" + "ld1 { v0.s }[2], [x27], #0x4\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v30.s }[2], [x25], #0x4\n" + "ld1 { v29.s }[2], [x24], #0x4\n" + "ld1 { v28.s }[2], [x23], #0x4\n" + "ld1 { v27.s }[2], [x22], #0x4\n" + "ld1 { v26.s }[2], [x21], #0x4\n" + "b 13f\n" + "12:" // Oddments: Planar loop: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 13f\n" + "ldr s2, [x9], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s0, [x27], #0x4\n" + "ldr s31, [x26], #0x4\n" + "ldr s30, [x25], #0x4\n" + "ldr s29, [x24], #0x4\n" + "ldr s28, [x23], #0x4\n" + "ldr s27, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "13:" // Oddments: Planar loop: Load: Bit 1: End + "subs x19, x19, #0x1\n" + "bgt 11b\n" + "14:" // Oddments: Planar tail + "fmla v25.4s, v2.4s, v23.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "add x27, x27, x11\n" + "fmla v24.4s, v1.4s, v23.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "fmla v22.4s, v0.4s, v23.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "add x26, x26, x11\n" + "fmla v21.4s, v31.4s, v23.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "fmla v20.4s, v30.4s, v23.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x25, x25, x11\n" + "fmla v19.4s, v29.4s, v23.4s\n" + "add x24, x24, x11\n" + "fmla v18.4s, v28.4s, v23.4s\n" + "add x23, x23, x11\n" + "fmla v17.4s, v27.4s, v23.4s\n" + "add x22, x22, x11\n" + "fmla v16.4s, v26.4s, v23.4s\n" + "add x21, x21, x11\n" + "fmax v25.4s, v25.4s, v4.4s\n" + "add x20, x20, x11\n" + "fmax v24.4s, v24.4s, v4.4s\n" + "add x19, x19, x11\n" + "fmax v22.4s, v22.4s, v4.4s\n" + "fmin v25.4s, v25.4s, v3.4s\n" + "fmin v24.4s, v24.4s, v3.4s\n" + "fmin v22.4s, v22.4s, v3.4s\n" + "fmax v21.4s, v21.4s, v4.4s\n" + "fmax v20.4s, v20.4s, v4.4s\n" + "fmax v19.4s, v19.4s, v4.4s\n" + "fmin v21.4s, v21.4s, v3.4s\n" + "fmin v20.4s, v20.4s, v3.4s\n" + "fmin v19.4s, v19.4s, v3.4s\n" + "fmax v18.4s, v18.4s, v4.4s\n" + "fmax v17.4s, v17.4s, v4.4s\n" + "fmax v16.4s, v16.4s, v4.4s\n" + "fmin v18.4s, v18.4s, v3.4s\n" + "fmin v17.4s, v17.4s, v3.4s\n" + "fmin v16.4s, v16.4s, v3.4s\n" + "tbz %x[n_channels], #1, 15f\n" + "st1 { v25.d }[0], [x27], #0x8\n" + "st1 { v24.d }[0], [x26], #0x8\n" + "st1 { v22.d }[0], [x25], #0x8\n" + "st1 { v21.d }[0], [x24], #0x8\n" + "st1 { v20.d }[0], [x23], #0x8\n" + "st1 { v19.d }[0], [x22], #0x8\n" + "st1 { v18.d }[0], [x21], #0x8\n" + "st1 { v17.d }[0], [x20], #0x8\n" + "st1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 16f\n" + "st1 { v25.s }[2], [x27], #0x4\n" + "st1 { v24.s }[2], [x26], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "st1 { v21.s }[2], [x24], #0x4\n" + "st1 { v20.s }[2], [x23], #0x4\n" + "st1 { v19.s }[2], [x22], #0x4\n" + "st1 { v18.s }[2], [x21], #0x4\n" + "st1 { v17.s }[2], [x20], #0x4\n" + "st1 { v16.s }[2], [x19], #0x4\n" + "b 16f\n" + "15:" // Oddments: Store: Bit 1: Unset + "tbz %x[n_channels], #0, 16f\n" + "st1 { v25.s }[0], [x27], #0x4\n" + "st1 { v24.s }[0], [x26], #0x4\n" + "st1 { v22.s }[0], [x25], #0x4\n" + "st1 { v21.s }[0], [x24], #0x4\n" + "st1 { v20.s }[0], [x23], #0x4\n" + "st1 { v19.s }[0], [x22], #0x4\n" + "st1 { v18.s }[0], [x21], #0x4\n" + "st1 { v17.s }[0], [x20], #0x4\n" + "st1 { v16.s }[0], [x19], #0x4\n" + "16:" // Oddments: Store: Bit 1: End + + "17:" // End + + : [params] "+&r" (params) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", 
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp new file mode 100644 index 0000000000..60f5ddd68f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); + +struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 3; + constexpr static unsigned int output_cols = 3; + + constexpr static unsigned int input_rows = 7; + constexpr static unsigned int input_cols = 7; + constexpr static unsigned int input_col_quads = 2; + + kern_type kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl; + + a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..5e334ec7b8 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const void *params, + const unsigned int n_output_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ldp x14, x13, [%x[outptrs], #0x0]\n" + "add x12, %x[clamps], #0x4\n" + "ldp x11, x10, [%x[outptrs], #0x10]\n" + "mov x9, #0x0\n" + "ldp x28, x27, [%x[outptrs], #0x20]\n" + "mov x26, #0x0\n" + "ldp x25, x24, [%x[outptrs], #0x30]\n" + "lsr x23, %x[channel_multiplier], #0x2\n" + "ldr x22, [%x[outptrs], #0x40]\n" + "ldr x21, [%x[inptrs], #0x0]\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "ldr q0, [x21, #0x0]\n" + "ldr q1, [x21, #0x10]\n" + "ldr q2, [x20, #0x0]\n" + "ldr q3, [x20, #0x10]\n" + "ldr q4, [x19, #0x0]\n" + "ldr q5, [x19, #0x10]\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "ldr x19, [%x[inptrs], #0x28]\n" + "ldr q6, [x21, #0x0]\n" + "ldr q7, [x21, #0x10]\n" + "ldr q8, [x20, #0x0]\n" + "ldr q9, [x20, #0x10]\n" + "ldr q10, [x19, #0x0]\n" + "ldr q11, [x19, #0x10]\n" + "ldr x19, [%x[inptrs], #0x30]\n" + "ld1r { v24.4s }, [%x[clamps]]\n" + "ld1r { v23.4s }, [x12]\n" + "ldr q12, [x19, #0x0]\n" + "ldr q13, [x19, #0x10]\n" + "cbz x23, 3f\n" + "ldr q14, [%x[params], #0x0]\n" + "mov v15.16b, v14.16b\n" + "ldr q31, [%x[params], #0x10]\n" + "subs x23, x23, #0x1\n" + "mov v16.16b, v14.16b\n" + "ldr q30, [%x[params], #0x20]\n" + "mov v17.16b, v14.16b\n" + "ldr q29, [%x[params], #0x30]\n" + "add %x[params], %x[params], #0x40\n" + "mov v18.16b, v14.16b\n" + "mov v19.16b, v14.16b\n" + "mov v20.16b, v14.16b\n" + "mov v21.16b, v14.16b\n" + "mov v22.16b, v14.16b\n" + "beq 2f\n" + "1:" // Output channel complete vector loop + "fmla v14.4s, v31.4s, v0.s[0]\n" + "add x9, x9, #0x4\n" + "fmla v15.4s, v31.4s, v0.s[2]\n" + "subs x23, x23, #0x1\n" + "fmla v16.4s, v31.4s, v1.s[0]\n" + "fmla v17.4s, v31.4s, v4.s[0]\n" + "fmla v18.4s, v31.4s, v4.s[2]\n" + "fmla v19.4s, v31.4s, v5.s[0]\n" + "fmla v20.4s, v31.4s, v8.s[0]\n" + "fmla v21.4s, v31.4s, v8.s[2]\n" + "fmla v22.4s, v31.4s, v9.s[0]\n" + "ldr q31, [%x[params], #0x0]\n" + "fmla v14.4s, v30.4s, v0.s[1]\n" + "fmla v15.4s, v30.4s, v0.s[3]\n" + "fmla v16.4s, v30.4s, v1.s[1]\n" + "fmla v17.4s, v30.4s, v4.s[1]\n" + "fmla v18.4s, v30.4s, v4.s[3]\n" + "fmla v19.4s, v30.4s, v5.s[1]\n" + "fmla v20.4s, v30.4s, v8.s[1]\n" + "fmla v21.4s, v30.4s, v8.s[3]\n" + "fmla v22.4s, v30.4s, v9.s[1]\n" + "ldr q30, [%x[params], #0x10]\n" + "fmla v14.4s, v29.4s, v0.s[2]\n" + "fmla v15.4s, v29.4s, v1.s[0]\n" + "fmla v16.4s, v29.4s, v1.s[2]\n" + "fmla v17.4s, v29.4s, v4.s[2]\n" + "fmla v18.4s, v29.4s, v5.s[0]\n" + "fmla v19.4s, v29.4s, v5.s[2]\n" + "fmla v20.4s, v29.4s, v8.s[2]\n" + "fmla v21.4s, v29.4s, v9.s[0]\n" + "fmla v22.4s, v29.4s, v9.s[2]\n" + "ldr q29, [%x[params], #0x20]\n" + "fmla v14.4s, v31.4s, v2.s[0]\n" + "fmla v15.4s, v31.4s, v2.s[2]\n" + "fmla v16.4s, v31.4s, v3.s[0]\n" + "fmla v17.4s, v31.4s, v6.s[0]\n" + "fmla v18.4s, v31.4s, v6.s[2]\n" + "fmla v19.4s, v31.4s, v7.s[0]\n" + "fmla v20.4s, v31.4s, v10.s[0]\n" + "fmla v21.4s, v31.4s, v10.s[2]\n" + "fmla v22.4s, v31.4s, v11.s[0]\n" + "ldr q31, [%x[params], #0x30]\n" + "fmla v14.4s, v30.4s, v2.s[1]\n" + "fmla v15.4s, v30.4s, v2.s[3]\n" + "fmla v16.4s, v30.4s, v3.s[1]\n" + "fmla v17.4s, v30.4s, v6.s[1]\n" + "fmla v18.4s, 
v30.4s, v6.s[3]\n" + "fmla v19.4s, v30.4s, v7.s[1]\n" + "fmla v20.4s, v30.4s, v10.s[1]\n" + "fmla v21.4s, v30.4s, v10.s[3]\n" + "fmla v22.4s, v30.4s, v11.s[1]\n" + "ldr q30, [%x[params], #0x40]\n" + "fmla v14.4s, v29.4s, v2.s[2]\n" + "fmla v15.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[2]\n" + "fmla v17.4s, v29.4s, v6.s[2]\n" + "fmla v18.4s, v29.4s, v7.s[0]\n" + "fmla v19.4s, v29.4s, v7.s[2]\n" + "fmla v20.4s, v29.4s, v10.s[2]\n" + "fmla v21.4s, v29.4s, v11.s[0]\n" + "fmla v22.4s, v29.4s, v11.s[2]\n" + "ldr q29, [%x[params], #0x50]\n" + "fmla v14.4s, v31.4s, v4.s[0]\n" + "fmla v15.4s, v31.4s, v4.s[2]\n" + "fmla v16.4s, v31.4s, v5.s[0]\n" + "fmla v17.4s, v31.4s, v8.s[0]\n" + "fmla v18.4s, v31.4s, v8.s[2]\n" + "fmla v19.4s, v31.4s, v9.s[0]\n" + "fmla v20.4s, v31.4s, v12.s[0]\n" + "fmla v21.4s, v31.4s, v12.s[2]\n" + "fmla v22.4s, v31.4s, v13.s[0]\n" + "ldr q31, [%x[params], #0x70]\n" + "fmla v14.4s, v30.4s, v4.s[1]\n" + "fmla v15.4s, v30.4s, v4.s[3]\n" + "fmla v16.4s, v30.4s, v5.s[1]\n" + "fmla v17.4s, v30.4s, v8.s[1]\n" + "fmla v18.4s, v30.4s, v8.s[3]\n" + "fmla v19.4s, v30.4s, v9.s[1]\n" + "fmla v20.4s, v30.4s, v12.s[1]\n" + "fmla v21.4s, v30.4s, v12.s[3]\n" + "fmla v22.4s, v30.4s, v13.s[1]\n" + "ldr q30, [%x[params], #0x80]\n" + "fmla v14.4s, v29.4s, v4.s[2]\n" + "fmla v15.4s, v29.4s, v5.s[0]\n" + "fmla v16.4s, v29.4s, v5.s[2]\n" + "fmla v17.4s, v29.4s, v8.s[2]\n" + "fmla v18.4s, v29.4s, v9.s[0]\n" + "fmla v19.4s, v29.4s, v9.s[2]\n" + "fmla v20.4s, v29.4s, v12.s[2]\n" + "fmla v21.4s, v29.4s, v13.s[0]\n" + "fmla v22.4s, v29.4s, v13.s[2]\n" + "ldr q29, [%x[params], #0x90]\n" + "fmin v14.4s, v14.4s, v23.4s\n" + "fmin v15.4s, v15.4s, v23.4s\n" + "fmin v16.4s, v16.4s, v23.4s\n" + "fmax v14.4s, v14.4s, v24.4s\n" + "str q14, [x14, x26]\n" + "fmax v15.4s, v15.4s, v24.4s\n" + "fmax v16.4s, v16.4s, v24.4s\n" + "ldr q14, [%x[params], #0x60]\n" + "add %x[params], %x[params], #0xa0\n" + "fmin v17.4s, v17.4s, v23.4s\n" + "str q15, [x13, x26]\n" + "fmin v18.4s, v18.4s, v23.4s\n" + "fmin v19.4s, v19.4s, v23.4s\n" + "str q16, [x11, x26]\n" + "fmin v20.4s, v20.4s, v23.4s\n" + "fmax v17.4s, v17.4s, v24.4s\n" + "str q17, [x10, x26]\n" + "fmax v18.4s, v18.4s, v24.4s\n" + "fmax v19.4s, v19.4s, v24.4s\n" + "str q18, [x28, x26]\n" + "fmax v20.4s, v20.4s, v24.4s\n" + "fmin v21.4s, v21.4s, v23.4s\n" + "str q19, [x27, x26]\n" + "fmin v22.4s, v22.4s, v23.4s\n" + "str q20, [x25, x26]\n" + "fmax v21.4s, v21.4s, v24.4s\n" + "mov v15.16b, v14.16b\n" + "str q21, [x24, x26]\n" + "fmax v22.4s, v22.4s, v24.4s\n" + "mov v16.16b, v14.16b\n" + "str q22, [x22, x26]\n" + "mov v17.16b, v14.16b\n" + "add x26, x26, #0x10\n" + "mov v18.16b, v14.16b\n" + "mov v19.16b, v14.16b\n" + "mov v20.16b, v14.16b\n" + "mov v21.16b, v14.16b\n" + "mov v22.16b, v14.16b\n" + "bgt 1b\n" + "2:" // Output channel complete vector tail + "fmla v14.4s, v31.4s, v0.s[0]\n" + "fmla v15.4s, v31.4s, v0.s[2]\n" + "fmla v16.4s, v31.4s, v1.s[0]\n" + "fmla v17.4s, v31.4s, v4.s[0]\n" + "fmla v18.4s, v31.4s, v4.s[2]\n" + "fmla v19.4s, v31.4s, v5.s[0]\n" + "fmla v20.4s, v31.4s, v8.s[0]\n" + "fmla v21.4s, v31.4s, v8.s[2]\n" + "fmla v22.4s, v31.4s, v9.s[0]\n" + "ldr q31, [%x[params], #0x0]\n" + "fmla v14.4s, v30.4s, v0.s[1]\n" + "fmla v15.4s, v30.4s, v0.s[3]\n" + "fmla v16.4s, v30.4s, v1.s[1]\n" + "fmla v17.4s, v30.4s, v4.s[1]\n" + "fmla v18.4s, v30.4s, v4.s[3]\n" + "fmla v19.4s, v30.4s, v5.s[1]\n" + "fmla v20.4s, v30.4s, v8.s[1]\n" + "fmla v21.4s, v30.4s, v8.s[3]\n" + "fmla v22.4s, v30.4s, v9.s[1]\n" + "ldr q30, [%x[params], #0x10]\n" + "fmla v14.4s, v29.4s, 
v0.s[2]\n" + "fmla v15.4s, v29.4s, v1.s[0]\n" + "fmla v16.4s, v29.4s, v1.s[2]\n" + "fmla v17.4s, v29.4s, v4.s[2]\n" + "fmla v18.4s, v29.4s, v5.s[0]\n" + "fmla v19.4s, v29.4s, v5.s[2]\n" + "fmla v20.4s, v29.4s, v8.s[2]\n" + "fmla v21.4s, v29.4s, v9.s[0]\n" + "fmla v22.4s, v29.4s, v9.s[2]\n" + "ldr q29, [%x[params], #0x20]\n" + "fmla v14.4s, v31.4s, v2.s[0]\n" + "fmla v15.4s, v31.4s, v2.s[2]\n" + "fmla v16.4s, v31.4s, v3.s[0]\n" + "fmla v17.4s, v31.4s, v6.s[0]\n" + "fmla v18.4s, v31.4s, v6.s[2]\n" + "fmla v19.4s, v31.4s, v7.s[0]\n" + "fmla v20.4s, v31.4s, v10.s[0]\n" + "fmla v21.4s, v31.4s, v10.s[2]\n" + "fmla v22.4s, v31.4s, v11.s[0]\n" + "ldr q31, [%x[params], #0x30]\n" + "fmla v14.4s, v30.4s, v2.s[1]\n" + "fmla v15.4s, v30.4s, v2.s[3]\n" + "fmla v16.4s, v30.4s, v3.s[1]\n" + "fmla v17.4s, v30.4s, v6.s[1]\n" + "fmla v18.4s, v30.4s, v6.s[3]\n" + "fmla v19.4s, v30.4s, v7.s[1]\n" + "fmla v20.4s, v30.4s, v10.s[1]\n" + "fmla v21.4s, v30.4s, v10.s[3]\n" + "fmla v22.4s, v30.4s, v11.s[1]\n" + "ldr q30, [%x[params], #0x40]\n" + "fmla v14.4s, v29.4s, v2.s[2]\n" + "fmla v15.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[2]\n" + "fmla v17.4s, v29.4s, v6.s[2]\n" + "fmla v18.4s, v29.4s, v7.s[0]\n" + "fmla v19.4s, v29.4s, v7.s[2]\n" + "fmla v20.4s, v29.4s, v10.s[2]\n" + "fmla v21.4s, v29.4s, v11.s[0]\n" + "fmla v22.4s, v29.4s, v11.s[2]\n" + "ldr q29, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "fmla v14.4s, v31.4s, v4.s[0]\n" + "fmla v15.4s, v31.4s, v4.s[2]\n" + "fmla v16.4s, v31.4s, v5.s[0]\n" + "fmla v17.4s, v31.4s, v8.s[0]\n" + "fmla v18.4s, v31.4s, v8.s[2]\n" + "fmla v19.4s, v31.4s, v9.s[0]\n" + "fmla v20.4s, v31.4s, v12.s[0]\n" + "fmla v21.4s, v31.4s, v12.s[2]\n" + "fmla v22.4s, v31.4s, v13.s[0]\n" + "fmla v14.4s, v30.4s, v4.s[1]\n" + "fmla v15.4s, v30.4s, v4.s[3]\n" + "fmla v16.4s, v30.4s, v5.s[1]\n" + "fmla v17.4s, v30.4s, v8.s[1]\n" + "fmla v18.4s, v30.4s, v8.s[3]\n" + "fmla v19.4s, v30.4s, v9.s[1]\n" + "fmla v20.4s, v30.4s, v12.s[1]\n" + "fmla v21.4s, v30.4s, v12.s[3]\n" + "fmla v22.4s, v30.4s, v13.s[1]\n" + "fmla v14.4s, v29.4s, v4.s[2]\n" + "fmla v15.4s, v29.4s, v5.s[0]\n" + "fmla v16.4s, v29.4s, v5.s[2]\n" + "fmla v17.4s, v29.4s, v8.s[2]\n" + "fmla v18.4s, v29.4s, v9.s[0]\n" + "fmla v19.4s, v29.4s, v9.s[2]\n" + "fmla v20.4s, v29.4s, v12.s[2]\n" + "fmla v21.4s, v29.4s, v13.s[0]\n" + "fmla v22.4s, v29.4s, v13.s[2]\n" + "fmin v14.4s, v14.4s, v23.4s\n" + "fmin v15.4s, v15.4s, v23.4s\n" + "fmin v16.4s, v16.4s, v23.4s\n" + "fmax v14.4s, v14.4s, v24.4s\n" + "str q14, [x14, x26]\n" + "fmax v15.4s, v15.4s, v24.4s\n" + "fmax v16.4s, v16.4s, v24.4s\n" + "str q15, [x13, x26]\n" + "fmin v17.4s, v17.4s, v23.4s\n" + "fmin v18.4s, v18.4s, v23.4s\n" + "str q16, [x11, x26]\n" + "fmin v19.4s, v19.4s, v23.4s\n" + "fmin v20.4s, v20.4s, v23.4s\n" + "fmax v17.4s, v17.4s, v24.4s\n" + "str q17, [x10, x26]\n" + "fmax v18.4s, v18.4s, v24.4s\n" + "fmax v19.4s, v19.4s, v24.4s\n" + "str q18, [x28, x26]\n" + "fmax v20.4s, v20.4s, v24.4s\n" + "fmin v21.4s, v21.4s, v23.4s\n" + "str q19, [x27, x26]\n" + "fmin v22.4s, v22.4s, v23.4s\n" + "str q20, [x25, x26]\n" + "fmax v21.4s, v21.4s, v24.4s\n" + "fmax v22.4s, v22.4s, v24.4s\n" + "str q21, [x24, x26]\n" + "str q22, [x22, x26]\n" + "add x26, x26, #0x10\n" + "3:" // Output channel oddments + "tst %x[channel_multiplier], #0x3\n" + "beq 6f\n" + "ldr q14, [%x[params], #0x0]\n" + "mov v15.16b, v14.16b\n" + "ldr q31, [%x[params], #0x10]\n" + "mov v16.16b, v14.16b\n" + "ldr q30, [%x[params], #0x20]\n" + "mov v17.16b, v14.16b\n" + "ldr q29, [%x[params], 
#0x30]\n" + "mov v18.16b, v14.16b\n" + "mov v19.16b, v14.16b\n" + "mov v20.16b, v14.16b\n" + "mov v21.16b, v14.16b\n" + "mov v22.16b, v14.16b\n" + "fmla v14.4s, v31.4s, v0.s[0]\n" + "fmla v15.4s, v31.4s, v0.s[2]\n" + "fmla v16.4s, v31.4s, v1.s[0]\n" + "fmla v17.4s, v31.4s, v4.s[0]\n" + "fmla v18.4s, v31.4s, v4.s[2]\n" + "fmla v19.4s, v31.4s, v5.s[0]\n" + "fmla v20.4s, v31.4s, v8.s[0]\n" + "fmla v21.4s, v31.4s, v8.s[2]\n" + "fmla v22.4s, v31.4s, v9.s[0]\n" + "ldr q31, [%x[params], #0x40]\n" + "fmla v14.4s, v30.4s, v0.s[1]\n" + "fmla v15.4s, v30.4s, v0.s[3]\n" + "fmla v16.4s, v30.4s, v1.s[1]\n" + "fmla v17.4s, v30.4s, v4.s[1]\n" + "fmla v18.4s, v30.4s, v4.s[3]\n" + "fmla v19.4s, v30.4s, v5.s[1]\n" + "fmla v20.4s, v30.4s, v8.s[1]\n" + "fmla v21.4s, v30.4s, v8.s[3]\n" + "fmla v22.4s, v30.4s, v9.s[1]\n" + "ldr q30, [%x[params], #0x50]\n" + "fmla v14.4s, v29.4s, v0.s[2]\n" + "fmla v15.4s, v29.4s, v1.s[0]\n" + "fmla v16.4s, v29.4s, v1.s[2]\n" + "fmla v17.4s, v29.4s, v4.s[2]\n" + "fmla v18.4s, v29.4s, v5.s[0]\n" + "fmla v19.4s, v29.4s, v5.s[2]\n" + "fmla v20.4s, v29.4s, v8.s[2]\n" + "fmla v21.4s, v29.4s, v9.s[0]\n" + "fmla v22.4s, v29.4s, v9.s[2]\n" + "ldr q29, [%x[params], #0x60]\n" + "fmla v14.4s, v31.4s, v2.s[0]\n" + "fmla v15.4s, v31.4s, v2.s[2]\n" + "fmla v16.4s, v31.4s, v3.s[0]\n" + "fmla v17.4s, v31.4s, v6.s[0]\n" + "fmla v18.4s, v31.4s, v6.s[2]\n" + "fmla v19.4s, v31.4s, v7.s[0]\n" + "fmla v20.4s, v31.4s, v10.s[0]\n" + "fmla v21.4s, v31.4s, v10.s[2]\n" + "fmla v22.4s, v31.4s, v11.s[0]\n" + "ldr q31, [%x[params], #0x70]\n" + "fmla v14.4s, v30.4s, v2.s[1]\n" + "fmla v15.4s, v30.4s, v2.s[3]\n" + "fmla v16.4s, v30.4s, v3.s[1]\n" + "fmla v17.4s, v30.4s, v6.s[1]\n" + "fmla v18.4s, v30.4s, v6.s[3]\n" + "fmla v19.4s, v30.4s, v7.s[1]\n" + "fmla v20.4s, v30.4s, v10.s[1]\n" + "fmla v21.4s, v30.4s, v10.s[3]\n" + "fmla v22.4s, v30.4s, v11.s[1]\n" + "ldr q30, [%x[params], #0x80]\n" + "fmla v14.4s, v29.4s, v2.s[2]\n" + "fmla v15.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[2]\n" + "fmla v17.4s, v29.4s, v6.s[2]\n" + "fmla v18.4s, v29.4s, v7.s[0]\n" + "fmla v19.4s, v29.4s, v7.s[2]\n" + "fmla v20.4s, v29.4s, v10.s[2]\n" + "fmla v21.4s, v29.4s, v11.s[0]\n" + "fmla v22.4s, v29.4s, v11.s[2]\n" + "ldr q29, [%x[params], #0x90]\n" + "add %x[params], %x[params], #0xa0\n" + "fmla v14.4s, v31.4s, v4.s[0]\n" + "fmla v15.4s, v31.4s, v4.s[2]\n" + "fmla v16.4s, v31.4s, v5.s[0]\n" + "fmla v17.4s, v31.4s, v8.s[0]\n" + "fmla v18.4s, v31.4s, v8.s[2]\n" + "fmla v19.4s, v31.4s, v9.s[0]\n" + "fmla v20.4s, v31.4s, v12.s[0]\n" + "fmla v21.4s, v31.4s, v12.s[2]\n" + "fmla v22.4s, v31.4s, v13.s[0]\n" + "fmla v14.4s, v30.4s, v4.s[1]\n" + "fmla v15.4s, v30.4s, v4.s[3]\n" + "fmla v16.4s, v30.4s, v5.s[1]\n" + "fmla v17.4s, v30.4s, v8.s[1]\n" + "fmla v18.4s, v30.4s, v8.s[3]\n" + "fmla v19.4s, v30.4s, v9.s[1]\n" + "fmla v20.4s, v30.4s, v12.s[1]\n" + "fmla v21.4s, v30.4s, v12.s[3]\n" + "fmla v22.4s, v30.4s, v13.s[1]\n" + "fmla v14.4s, v29.4s, v4.s[2]\n" + "fmla v15.4s, v29.4s, v5.s[0]\n" + "fmla v16.4s, v29.4s, v5.s[2]\n" + "fmla v17.4s, v29.4s, v8.s[2]\n" + "fmla v18.4s, v29.4s, v9.s[0]\n" + "fmla v19.4s, v29.4s, v9.s[2]\n" + "fmla v20.4s, v29.4s, v12.s[2]\n" + "fmla v21.4s, v29.4s, v13.s[0]\n" + "fmla v22.4s, v29.4s, v13.s[2]\n" + "fmin v14.4s, v14.4s, v23.4s\n" + "fmin v15.4s, v15.4s, v23.4s\n" + "fmin v16.4s, v16.4s, v23.4s\n" + "fmax v14.4s, v14.4s, v24.4s\n" + "fmax v15.4s, v15.4s, v24.4s\n" + "fmax v16.4s, v16.4s, v24.4s\n" + "fmin v17.4s, v17.4s, v23.4s\n" + "fmin v18.4s, v18.4s, v23.4s\n" + "fmin v19.4s, v19.4s, v23.4s\n" 
+ "fmax v17.4s, v17.4s, v24.4s\n" + "fmax v18.4s, v18.4s, v24.4s\n" + "fmax v19.4s, v19.4s, v24.4s\n" + "fmin v20.4s, v20.4s, v23.4s\n" + "fmin v21.4s, v21.4s, v23.4s\n" + "fmin v22.4s, v22.4s, v23.4s\n" + "fmax v20.4s, v20.4s, v24.4s\n" + "fmax v21.4s, v21.4s, v24.4s\n" + "fmax v22.4s, v22.4s, v24.4s\n" + "tbz %x[channel_multiplier], #1, 4f\n" + "add x19, x14, x26\n" + "st1 { v14.d }[0], [x19]\n" + "add x19, x13, x26\n" + "st1 { v15.d }[0], [x19]\n" + "add x19, x11, x26\n" + "st1 { v16.d }[0], [x19]\n" + "add x19, x10, x26\n" + "st1 { v17.d }[0], [x19]\n" + "add x19, x28, x26\n" + "st1 { v18.d }[0], [x19]\n" + "add x19, x27, x26\n" + "st1 { v19.d }[0], [x19]\n" + "add x19, x25, x26\n" + "st1 { v20.d }[0], [x19]\n" + "add x19, x24, x26\n" + "st1 { v21.d }[0], [x19]\n" + "add x19, x22, x26\n" + "st1 { v22.d }[0], [x19]\n" + "add x26, x26, #0x8\n" + "tbz %x[channel_multiplier], #0, 5f\n" + "add x19, x14, x26\n" + "st1 { v14.s }[2], [x19]\n" + "add x19, x13, x26\n" + "st1 { v15.s }[2], [x19]\n" + "add x19, x11, x26\n" + "st1 { v16.s }[2], [x19]\n" + "add x19, x10, x26\n" + "st1 { v17.s }[2], [x19]\n" + "add x19, x28, x26\n" + "st1 { v18.s }[2], [x19]\n" + "add x19, x27, x26\n" + "st1 { v19.s }[2], [x19]\n" + "add x19, x25, x26\n" + "st1 { v20.s }[2], [x19]\n" + "add x19, x24, x26\n" + "st1 { v21.s }[2], [x19]\n" + "add x19, x22, x26\n" + "st1 { v22.s }[2], [x19]\n" + "b 5f\n" + "4:" // Output channel oddments: Store: Bit 1: Unset + "tbz %x[channel_multiplier], #0, 5f\n" + "add x19, x14, x26\n" + "st1 { v14.s }[0], [x19]\n" + "add x19, x13, x26\n" + "st1 { v15.s }[0], [x19]\n" + "add x19, x11, x26\n" + "st1 { v16.s }[0], [x19]\n" + "add x19, x10, x26\n" + "st1 { v17.s }[0], [x19]\n" + "add x19, x28, x26\n" + "st1 { v18.s }[0], [x19]\n" + "add x19, x27, x26\n" + "st1 { v19.s }[0], [x19]\n" + "add x19, x25, x26\n" + "st1 { v20.s }[0], [x19]\n" + "add x19, x24, x26\n" + "st1 { v21.s }[0], [x19]\n" + "add x19, x22, x26\n" + "st1 { v22.s }[0], [x19]\n" + "5:" // Output channel oddments: Store: Bit 1: End + + "6:" // End + + : [params] "+&r" (params) + : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp new file mode 100644 index 0000000000..92d6a757f2 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 8;
+  constexpr static unsigned int input_col_quads = 2;
+
+  kern_type kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+
+  a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..6e9e97fa29
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const void *params, + const unsigned int n_output_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ldp x13, x12, [%x[outptrs], #0x0]\n" + "add x11, %x[clamps], #0x4\n" + "ldp x10, x9, [%x[outptrs], #0x10]\n" + "mov x28, #0x0\n" + "ldp x27, x26, [%x[outptrs], #0x20]\n" + "mov x25, #0x0\n" + "ldp x24, x23, [%x[outptrs], #0x30]\n" + "lsr x22, %x[channel_multiplier], #0x2\n" + "ldr x21, [%x[inptrs], #0x0]\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "ldr q0, [x21, #0x0]\n" + "ldr q1, [x21, #0x10]\n" + "ldr q2, [x20, #0x0]\n" + "ldr q3, [x20, #0x10]\n" + "ldr q4, [x19, #0x0]\n" + "ldr q5, [x19, #0x10]\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "ldr x19, [%x[inptrs], #0x28]\n" + "ldr q6, [x21, #0x0]\n" + "ldr q7, [x21, #0x10]\n" + "ldr q8, [x20, #0x0]\n" + "ldr q9, [x20, #0x10]\n" + "ldr q10, [x19, #0x0]\n" + "ldr q11, [x19, #0x10]\n" + "ld1r { v21.4s }, [%x[clamps]]\n" + "ld1r { v20.4s }, [x11]\n" + "cbz x22, 3f\n" + "ldr q12, [%x[params], #0x0]\n" + "mov v13.16b, v12.16b\n" + "ldr q31, [%x[params], #0x10]\n" + "subs x22, x22, #0x1\n" + "mov v14.16b, v12.16b\n" + "ldr q30, [%x[params], #0x20]\n" + "mov v15.16b, v12.16b\n" + "ldr q29, [%x[params], #0x30]\n" + "mov v16.16b, v12.16b\n" + "ldr q28, [%x[params], #0x40]\n" + "mov v17.16b, v12.16b\n" + "ldr q27, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v12.16b\n" + "beq 2f\n" + "1:" // Output channel complete vector loop + "fmla v12.4s, v31.4s, v0.s[0]\n" + "add x28, x28, #0x4\n" + "fmla v13.4s, v31.4s, v0.s[1]\n" + "subs x22, x22, #0x1\n" + "fmla v14.4s, v31.4s, v0.s[2]\n" + "fmla v15.4s, v31.4s, v0.s[3]\n" + "fmla v16.4s, v31.4s, v2.s[0]\n" + "fmla v17.4s, v31.4s, v2.s[1]\n" + "fmla v18.4s, v31.4s, v2.s[2]\n" + "fmla v19.4s, v31.4s, v2.s[3]\n" + "ldr q31, [%x[params], #0x0]\n" + "fmla v12.4s, v30.4s, v0.s[1]\n" + "fmla v13.4s, v30.4s, v0.s[2]\n" + "fmla v14.4s, v30.4s, v0.s[3]\n" + "fmla v15.4s, v30.4s, v1.s[0]\n" + "fmla v16.4s, v30.4s, v2.s[1]\n" + "fmla v17.4s, v30.4s, 
v2.s[2]\n" + "fmla v18.4s, v30.4s, v2.s[3]\n" + "fmla v19.4s, v30.4s, v3.s[0]\n" + "ldr q30, [%x[params], #0x10]\n" + "fmla v12.4s, v29.4s, v0.s[2]\n" + "fmla v13.4s, v29.4s, v0.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v15.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v17.4s, v29.4s, v2.s[3]\n" + "fmla v18.4s, v29.4s, v3.s[0]\n" + "fmla v19.4s, v29.4s, v3.s[1]\n" + "ldr q29, [%x[params], #0x20]\n" + "fmla v12.4s, v28.4s, v0.s[3]\n" + "fmla v13.4s, v28.4s, v1.s[0]\n" + "fmla v14.4s, v28.4s, v1.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v16.4s, v28.4s, v2.s[3]\n" + "fmla v17.4s, v28.4s, v3.s[0]\n" + "fmla v18.4s, v28.4s, v3.s[1]\n" + "fmla v19.4s, v28.4s, v3.s[2]\n" + "ldr q28, [%x[params], #0x30]\n" + "fmla v12.4s, v27.4s, v1.s[0]\n" + "fmla v13.4s, v27.4s, v1.s[1]\n" + "fmla v14.4s, v27.4s, v1.s[2]\n" + "fmla v15.4s, v27.4s, v1.s[3]\n" + "fmla v16.4s, v27.4s, v3.s[0]\n" + "fmla v17.4s, v27.4s, v3.s[1]\n" + "fmla v18.4s, v27.4s, v3.s[2]\n" + "fmla v19.4s, v27.4s, v3.s[3]\n" + "ldr q27, [%x[params], #0x40]\n" + "fmla v12.4s, v31.4s, v2.s[0]\n" + "fmla v13.4s, v31.4s, v2.s[1]\n" + "fmla v14.4s, v31.4s, v2.s[2]\n" + "fmla v15.4s, v31.4s, v2.s[3]\n" + "fmla v16.4s, v31.4s, v4.s[0]\n" + "fmla v17.4s, v31.4s, v4.s[1]\n" + "fmla v18.4s, v31.4s, v4.s[2]\n" + "fmla v19.4s, v31.4s, v4.s[3]\n" + "ldr q31, [%x[params], #0x50]\n" + "fmla v12.4s, v30.4s, v2.s[1]\n" + "fmla v13.4s, v30.4s, v2.s[2]\n" + "fmla v14.4s, v30.4s, v2.s[3]\n" + "fmla v15.4s, v30.4s, v3.s[0]\n" + "fmla v16.4s, v30.4s, v4.s[1]\n" + "fmla v17.4s, v30.4s, v4.s[2]\n" + "fmla v18.4s, v30.4s, v4.s[3]\n" + "fmla v19.4s, v30.4s, v5.s[0]\n" + "ldr q30, [%x[params], #0x60]\n" + "fmla v12.4s, v29.4s, v2.s[2]\n" + "fmla v13.4s, v29.4s, v2.s[3]\n" + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v15.4s, v29.4s, v3.s[1]\n" + "fmla v16.4s, v29.4s, v4.s[2]\n" + "fmla v17.4s, v29.4s, v4.s[3]\n" + "fmla v18.4s, v29.4s, v5.s[0]\n" + "fmla v19.4s, v29.4s, v5.s[1]\n" + "ldr q29, [%x[params], #0x70]\n" + "fmla v12.4s, v28.4s, v2.s[3]\n" + "fmla v13.4s, v28.4s, v3.s[0]\n" + "fmla v14.4s, v28.4s, v3.s[1]\n" + "fmla v15.4s, v28.4s, v3.s[2]\n" + "fmla v16.4s, v28.4s, v4.s[3]\n" + "fmla v17.4s, v28.4s, v5.s[0]\n" + "fmla v18.4s, v28.4s, v5.s[1]\n" + "fmla v19.4s, v28.4s, v5.s[2]\n" + "ldr q28, [%x[params], #0x80]\n" + "fmla v12.4s, v27.4s, v3.s[0]\n" + "fmla v13.4s, v27.4s, v3.s[1]\n" + "fmla v14.4s, v27.4s, v3.s[2]\n" + "fmla v15.4s, v27.4s, v3.s[3]\n" + "fmla v16.4s, v27.4s, v5.s[0]\n" + "fmla v17.4s, v27.4s, v5.s[1]\n" + "fmla v18.4s, v27.4s, v5.s[2]\n" + "fmla v19.4s, v27.4s, v5.s[3]\n" + "ldr q27, [%x[params], #0x90]\n" + "fmla v12.4s, v31.4s, v4.s[0]\n" + "fmla v13.4s, v31.4s, v4.s[1]\n" + "fmla v14.4s, v31.4s, v4.s[2]\n" + "fmla v15.4s, v31.4s, v4.s[3]\n" + "fmla v16.4s, v31.4s, v6.s[0]\n" + "fmla v17.4s, v31.4s, v6.s[1]\n" + "fmla v18.4s, v31.4s, v6.s[2]\n" + "fmla v19.4s, v31.4s, v6.s[3]\n" + "ldr q31, [%x[params], #0xa0]\n" + "fmla v12.4s, v30.4s, v4.s[1]\n" + "fmla v13.4s, v30.4s, v4.s[2]\n" + "fmla v14.4s, v30.4s, v4.s[3]\n" + "fmla v15.4s, v30.4s, v5.s[0]\n" + "fmla v16.4s, v30.4s, v6.s[1]\n" + "fmla v17.4s, v30.4s, v6.s[2]\n" + "fmla v18.4s, v30.4s, v6.s[3]\n" + "fmla v19.4s, v30.4s, v7.s[0]\n" + "ldr q30, [%x[params], #0xb0]\n" + "fmla v12.4s, v29.4s, v4.s[2]\n" + "fmla v13.4s, v29.4s, v4.s[3]\n" + "fmla v14.4s, v29.4s, v5.s[0]\n" + "fmla v15.4s, v29.4s, v5.s[1]\n" + "fmla v16.4s, v29.4s, v6.s[2]\n" + "fmla v17.4s, v29.4s, v6.s[3]\n" + "fmla v18.4s, v29.4s, v7.s[0]\n" + "fmla v19.4s, v29.4s, v7.s[1]\n" + "ldr 
q29, [%x[params], #0xc0]\n" + "fmla v12.4s, v28.4s, v4.s[3]\n" + "fmla v13.4s, v28.4s, v5.s[0]\n" + "fmla v14.4s, v28.4s, v5.s[1]\n" + "fmla v15.4s, v28.4s, v5.s[2]\n" + "fmla v16.4s, v28.4s, v6.s[3]\n" + "fmla v17.4s, v28.4s, v7.s[0]\n" + "fmla v18.4s, v28.4s, v7.s[1]\n" + "fmla v19.4s, v28.4s, v7.s[2]\n" + "ldr q28, [%x[params], #0xd0]\n" + "fmla v12.4s, v27.4s, v5.s[0]\n" + "fmla v13.4s, v27.4s, v5.s[1]\n" + "fmla v14.4s, v27.4s, v5.s[2]\n" + "fmla v15.4s, v27.4s, v5.s[3]\n" + "fmla v16.4s, v27.4s, v7.s[0]\n" + "fmla v17.4s, v27.4s, v7.s[1]\n" + "fmla v18.4s, v27.4s, v7.s[2]\n" + "fmla v19.4s, v27.4s, v7.s[3]\n" + "ldr q27, [%x[params], #0xe0]\n" + "fmla v12.4s, v31.4s, v6.s[0]\n" + "fmla v13.4s, v31.4s, v6.s[1]\n" + "fmla v14.4s, v31.4s, v6.s[2]\n" + "fmla v15.4s, v31.4s, v6.s[3]\n" + "fmla v16.4s, v31.4s, v8.s[0]\n" + "fmla v17.4s, v31.4s, v8.s[1]\n" + "fmla v18.4s, v31.4s, v8.s[2]\n" + "fmla v19.4s, v31.4s, v8.s[3]\n" + "ldr q31, [%x[params], #0xf0]\n" + "fmla v12.4s, v30.4s, v6.s[1]\n" + "fmla v13.4s, v30.4s, v6.s[2]\n" + "fmla v14.4s, v30.4s, v6.s[3]\n" + "fmla v15.4s, v30.4s, v7.s[0]\n" + "fmla v16.4s, v30.4s, v8.s[1]\n" + "fmla v17.4s, v30.4s, v8.s[2]\n" + "fmla v18.4s, v30.4s, v8.s[3]\n" + "fmla v19.4s, v30.4s, v9.s[0]\n" + "ldr q30, [%x[params], #0x100]\n" + "fmla v12.4s, v29.4s, v6.s[2]\n" + "fmla v13.4s, v29.4s, v6.s[3]\n" + "fmla v14.4s, v29.4s, v7.s[0]\n" + "fmla v15.4s, v29.4s, v7.s[1]\n" + "fmla v16.4s, v29.4s, v8.s[2]\n" + "fmla v17.4s, v29.4s, v8.s[3]\n" + "fmla v18.4s, v29.4s, v9.s[0]\n" + "fmla v19.4s, v29.4s, v9.s[1]\n" + "ldr q29, [%x[params], #0x110]\n" + "fmla v12.4s, v28.4s, v6.s[3]\n" + "fmla v13.4s, v28.4s, v7.s[0]\n" + "fmla v14.4s, v28.4s, v7.s[1]\n" + "fmla v15.4s, v28.4s, v7.s[2]\n" + "fmla v16.4s, v28.4s, v8.s[3]\n" + "fmla v17.4s, v28.4s, v9.s[0]\n" + "fmla v18.4s, v28.4s, v9.s[1]\n" + "fmla v19.4s, v28.4s, v9.s[2]\n" + "ldr q28, [%x[params], #0x120]\n" + "fmla v12.4s, v27.4s, v7.s[0]\n" + "fmla v13.4s, v27.4s, v7.s[1]\n" + "fmla v14.4s, v27.4s, v7.s[2]\n" + "fmla v15.4s, v27.4s, v7.s[3]\n" + "fmla v16.4s, v27.4s, v9.s[0]\n" + "fmla v17.4s, v27.4s, v9.s[1]\n" + "fmla v18.4s, v27.4s, v9.s[2]\n" + "fmla v19.4s, v27.4s, v9.s[3]\n" + "ldr q27, [%x[params], #0x130]\n" + "fmla v12.4s, v31.4s, v8.s[0]\n" + "fmla v13.4s, v31.4s, v8.s[1]\n" + "fmla v14.4s, v31.4s, v8.s[2]\n" + "fmla v15.4s, v31.4s, v8.s[3]\n" + "fmla v16.4s, v31.4s, v10.s[0]\n" + "fmla v17.4s, v31.4s, v10.s[1]\n" + "fmla v18.4s, v31.4s, v10.s[2]\n" + "fmla v19.4s, v31.4s, v10.s[3]\n" + "ldr q31, [%x[params], #0x150]\n" + "fmla v12.4s, v30.4s, v8.s[1]\n" + "fmla v13.4s, v30.4s, v8.s[2]\n" + "fmla v14.4s, v30.4s, v8.s[3]\n" + "fmla v15.4s, v30.4s, v9.s[0]\n" + "fmla v16.4s, v30.4s, v10.s[1]\n" + "fmla v17.4s, v30.4s, v10.s[2]\n" + "fmla v18.4s, v30.4s, v10.s[3]\n" + "fmla v19.4s, v30.4s, v11.s[0]\n" + "ldr q30, [%x[params], #0x160]\n" + "fmla v12.4s, v29.4s, v8.s[2]\n" + "fmla v13.4s, v29.4s, v8.s[3]\n" + "fmla v14.4s, v29.4s, v9.s[0]\n" + "fmla v15.4s, v29.4s, v9.s[1]\n" + "fmla v16.4s, v29.4s, v10.s[2]\n" + "fmla v17.4s, v29.4s, v10.s[3]\n" + "fmla v18.4s, v29.4s, v11.s[0]\n" + "fmla v19.4s, v29.4s, v11.s[1]\n" + "ldr q29, [%x[params], #0x170]\n" + "fmla v12.4s, v28.4s, v8.s[3]\n" + "fmla v13.4s, v28.4s, v9.s[0]\n" + "fmla v14.4s, v28.4s, v9.s[1]\n" + "fmla v15.4s, v28.4s, v9.s[2]\n" + "fmla v16.4s, v28.4s, v10.s[3]\n" + "fmla v17.4s, v28.4s, v11.s[0]\n" + "fmla v18.4s, v28.4s, v11.s[1]\n" + "fmla v19.4s, v28.4s, v11.s[2]\n" + "ldr q28, [%x[params], #0x180]\n" + "fmla v12.4s, v27.4s, v9.s[0]\n" + 
"fmla v13.4s, v27.4s, v9.s[1]\n" + "fmla v14.4s, v27.4s, v9.s[2]\n" + "fmla v15.4s, v27.4s, v9.s[3]\n" + "fmla v16.4s, v27.4s, v11.s[0]\n" + "fmla v17.4s, v27.4s, v11.s[1]\n" + "fmla v18.4s, v27.4s, v11.s[2]\n" + "fmla v19.4s, v27.4s, v11.s[3]\n" + "ldr q27, [%x[params], #0x190]\n" + "fmin v12.4s, v12.4s, v20.4s\n" + "fmin v13.4s, v13.4s, v20.4s\n" + "fmin v14.4s, v14.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v21.4s\n" + "str q12, [x13, x25]\n" + "fmax v13.4s, v13.4s, v21.4s\n" + "fmax v14.4s, v14.4s, v21.4s\n" + "ldr q12, [%x[params], #0x140]\n" + "add %x[params], %x[params], #0x1a0\n" + "fmin v15.4s, v15.4s, v20.4s\n" + "str q13, [x12, x25]\n" + "fmin v16.4s, v16.4s, v20.4s\n" + "fmin v17.4s, v17.4s, v20.4s\n" + "str q14, [x10, x25]\n" + "fmin v18.4s, v18.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v21.4s\n" + "str q15, [x9, x25]\n" + "fmax v16.4s, v16.4s, v21.4s\n" + "fmax v17.4s, v17.4s, v21.4s\n" + "str q16, [x27, x25]\n" + "fmax v18.4s, v18.4s, v21.4s\n" + "fmin v19.4s, v19.4s, v20.4s\n" + "str q17, [x26, x25]\n" + "mov v13.16b, v12.16b\n" + "str q18, [x24, x25]\n" + "fmax v19.4s, v19.4s, v21.4s\n" + "mov v14.16b, v12.16b\n" + "str q19, [x23, x25]\n" + "mov v15.16b, v12.16b\n" + "add x25, x25, #0x10\n" + "mov v16.16b, v12.16b\n" + "mov v17.16b, v12.16b\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v12.16b\n" + "bgt 1b\n" + "2:" // Output channel complete vector tail + "fmla v12.4s, v31.4s, v0.s[0]\n" + "fmla v13.4s, v31.4s, v0.s[1]\n" + "fmla v14.4s, v31.4s, v0.s[2]\n" + "fmla v15.4s, v31.4s, v0.s[3]\n" + "fmla v16.4s, v31.4s, v2.s[0]\n" + "fmla v17.4s, v31.4s, v2.s[1]\n" + "fmla v18.4s, v31.4s, v2.s[2]\n" + "fmla v19.4s, v31.4s, v2.s[3]\n" + "ldr q31, [%x[params], #0x0]\n" + "fmla v12.4s, v30.4s, v0.s[1]\n" + "fmla v13.4s, v30.4s, v0.s[2]\n" + "fmla v14.4s, v30.4s, v0.s[3]\n" + "fmla v15.4s, v30.4s, v1.s[0]\n" + "fmla v16.4s, v30.4s, v2.s[1]\n" + "fmla v17.4s, v30.4s, v2.s[2]\n" + "fmla v18.4s, v30.4s, v2.s[3]\n" + "fmla v19.4s, v30.4s, v3.s[0]\n" + "ldr q30, [%x[params], #0x10]\n" + "fmla v12.4s, v29.4s, v0.s[2]\n" + "fmla v13.4s, v29.4s, v0.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v15.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v17.4s, v29.4s, v2.s[3]\n" + "fmla v18.4s, v29.4s, v3.s[0]\n" + "fmla v19.4s, v29.4s, v3.s[1]\n" + "ldr q29, [%x[params], #0x20]\n" + "fmla v12.4s, v28.4s, v0.s[3]\n" + "fmla v13.4s, v28.4s, v1.s[0]\n" + "fmla v14.4s, v28.4s, v1.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v16.4s, v28.4s, v2.s[3]\n" + "fmla v17.4s, v28.4s, v3.s[0]\n" + "fmla v18.4s, v28.4s, v3.s[1]\n" + "fmla v19.4s, v28.4s, v3.s[2]\n" + "ldr q28, [%x[params], #0x30]\n" + "fmla v12.4s, v27.4s, v1.s[0]\n" + "fmla v13.4s, v27.4s, v1.s[1]\n" + "fmla v14.4s, v27.4s, v1.s[2]\n" + "fmla v15.4s, v27.4s, v1.s[3]\n" + "fmla v16.4s, v27.4s, v3.s[0]\n" + "fmla v17.4s, v27.4s, v3.s[1]\n" + "fmla v18.4s, v27.4s, v3.s[2]\n" + "fmla v19.4s, v27.4s, v3.s[3]\n" + "ldr q27, [%x[params], #0x40]\n" + "fmla v12.4s, v31.4s, v2.s[0]\n" + "fmla v13.4s, v31.4s, v2.s[1]\n" + "fmla v14.4s, v31.4s, v2.s[2]\n" + "fmla v15.4s, v31.4s, v2.s[3]\n" + "fmla v16.4s, v31.4s, v4.s[0]\n" + "fmla v17.4s, v31.4s, v4.s[1]\n" + "fmla v18.4s, v31.4s, v4.s[2]\n" + "fmla v19.4s, v31.4s, v4.s[3]\n" + "ldr q31, [%x[params], #0x50]\n" + "fmla v12.4s, v30.4s, v2.s[1]\n" + "fmla v13.4s, v30.4s, v2.s[2]\n" + "fmla v14.4s, v30.4s, v2.s[3]\n" + "fmla v15.4s, v30.4s, v3.s[0]\n" + "fmla v16.4s, v30.4s, v4.s[1]\n" + "fmla v17.4s, v30.4s, v4.s[2]\n" + "fmla v18.4s, v30.4s, v4.s[3]\n" + "fmla v19.4s, v30.4s, v5.s[0]\n" 
+ "ldr q30, [%x[params], #0x60]\n" + "fmla v12.4s, v29.4s, v2.s[2]\n" + "fmla v13.4s, v29.4s, v2.s[3]\n" + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v15.4s, v29.4s, v3.s[1]\n" + "fmla v16.4s, v29.4s, v4.s[2]\n" + "fmla v17.4s, v29.4s, v4.s[3]\n" + "fmla v18.4s, v29.4s, v5.s[0]\n" + "fmla v19.4s, v29.4s, v5.s[1]\n" + "ldr q29, [%x[params], #0x70]\n" + "fmla v12.4s, v28.4s, v2.s[3]\n" + "fmla v13.4s, v28.4s, v3.s[0]\n" + "fmla v14.4s, v28.4s, v3.s[1]\n" + "fmla v15.4s, v28.4s, v3.s[2]\n" + "fmla v16.4s, v28.4s, v4.s[3]\n" + "fmla v17.4s, v28.4s, v5.s[0]\n" + "fmla v18.4s, v28.4s, v5.s[1]\n" + "fmla v19.4s, v28.4s, v5.s[2]\n" + "ldr q28, [%x[params], #0x80]\n" + "fmla v12.4s, v27.4s, v3.s[0]\n" + "fmla v13.4s, v27.4s, v3.s[1]\n" + "fmla v14.4s, v27.4s, v3.s[2]\n" + "fmla v15.4s, v27.4s, v3.s[3]\n" + "fmla v16.4s, v27.4s, v5.s[0]\n" + "fmla v17.4s, v27.4s, v5.s[1]\n" + "fmla v18.4s, v27.4s, v5.s[2]\n" + "fmla v19.4s, v27.4s, v5.s[3]\n" + "ldr q27, [%x[params], #0x90]\n" + "fmla v12.4s, v31.4s, v4.s[0]\n" + "fmla v13.4s, v31.4s, v4.s[1]\n" + "fmla v14.4s, v31.4s, v4.s[2]\n" + "fmla v15.4s, v31.4s, v4.s[3]\n" + "fmla v16.4s, v31.4s, v6.s[0]\n" + "fmla v17.4s, v31.4s, v6.s[1]\n" + "fmla v18.4s, v31.4s, v6.s[2]\n" + "fmla v19.4s, v31.4s, v6.s[3]\n" + "ldr q31, [%x[params], #0xa0]\n" + "fmla v12.4s, v30.4s, v4.s[1]\n" + "fmla v13.4s, v30.4s, v4.s[2]\n" + "fmla v14.4s, v30.4s, v4.s[3]\n" + "fmla v15.4s, v30.4s, v5.s[0]\n" + "fmla v16.4s, v30.4s, v6.s[1]\n" + "fmla v17.4s, v30.4s, v6.s[2]\n" + "fmla v18.4s, v30.4s, v6.s[3]\n" + "fmla v19.4s, v30.4s, v7.s[0]\n" + "ldr q30, [%x[params], #0xb0]\n" + "fmla v12.4s, v29.4s, v4.s[2]\n" + "fmla v13.4s, v29.4s, v4.s[3]\n" + "fmla v14.4s, v29.4s, v5.s[0]\n" + "fmla v15.4s, v29.4s, v5.s[1]\n" + "fmla v16.4s, v29.4s, v6.s[2]\n" + "fmla v17.4s, v29.4s, v6.s[3]\n" + "fmla v18.4s, v29.4s, v7.s[0]\n" + "fmla v19.4s, v29.4s, v7.s[1]\n" + "ldr q29, [%x[params], #0xc0]\n" + "fmla v12.4s, v28.4s, v4.s[3]\n" + "fmla v13.4s, v28.4s, v5.s[0]\n" + "fmla v14.4s, v28.4s, v5.s[1]\n" + "fmla v15.4s, v28.4s, v5.s[2]\n" + "fmla v16.4s, v28.4s, v6.s[3]\n" + "fmla v17.4s, v28.4s, v7.s[0]\n" + "fmla v18.4s, v28.4s, v7.s[1]\n" + "fmla v19.4s, v28.4s, v7.s[2]\n" + "ldr q28, [%x[params], #0xd0]\n" + "fmla v12.4s, v27.4s, v5.s[0]\n" + "fmla v13.4s, v27.4s, v5.s[1]\n" + "fmla v14.4s, v27.4s, v5.s[2]\n" + "fmla v15.4s, v27.4s, v5.s[3]\n" + "fmla v16.4s, v27.4s, v7.s[0]\n" + "fmla v17.4s, v27.4s, v7.s[1]\n" + "fmla v18.4s, v27.4s, v7.s[2]\n" + "fmla v19.4s, v27.4s, v7.s[3]\n" + "ldr q27, [%x[params], #0xe0]\n" + "fmla v12.4s, v31.4s, v6.s[0]\n" + "fmla v13.4s, v31.4s, v6.s[1]\n" + "fmla v14.4s, v31.4s, v6.s[2]\n" + "fmla v15.4s, v31.4s, v6.s[3]\n" + "fmla v16.4s, v31.4s, v8.s[0]\n" + "fmla v17.4s, v31.4s, v8.s[1]\n" + "fmla v18.4s, v31.4s, v8.s[2]\n" + "fmla v19.4s, v31.4s, v8.s[3]\n" + "ldr q31, [%x[params], #0xf0]\n" + "fmla v12.4s, v30.4s, v6.s[1]\n" + "fmla v13.4s, v30.4s, v6.s[2]\n" + "fmla v14.4s, v30.4s, v6.s[3]\n" + "fmla v15.4s, v30.4s, v7.s[0]\n" + "fmla v16.4s, v30.4s, v8.s[1]\n" + "fmla v17.4s, v30.4s, v8.s[2]\n" + "fmla v18.4s, v30.4s, v8.s[3]\n" + "fmla v19.4s, v30.4s, v9.s[0]\n" + "ldr q30, [%x[params], #0x100]\n" + "fmla v12.4s, v29.4s, v6.s[2]\n" + "fmla v13.4s, v29.4s, v6.s[3]\n" + "fmla v14.4s, v29.4s, v7.s[0]\n" + "fmla v15.4s, v29.4s, v7.s[1]\n" + "fmla v16.4s, v29.4s, v8.s[2]\n" + "fmla v17.4s, v29.4s, v8.s[3]\n" + "fmla v18.4s, v29.4s, v9.s[0]\n" + "fmla v19.4s, v29.4s, v9.s[1]\n" + "ldr q29, [%x[params], #0x110]\n" + "fmla v12.4s, v28.4s, v6.s[3]\n" + "fmla v13.4s, 
v28.4s, v7.s[0]\n" + "fmla v14.4s, v28.4s, v7.s[1]\n" + "fmla v15.4s, v28.4s, v7.s[2]\n" + "fmla v16.4s, v28.4s, v8.s[3]\n" + "fmla v17.4s, v28.4s, v9.s[0]\n" + "fmla v18.4s, v28.4s, v9.s[1]\n" + "fmla v19.4s, v28.4s, v9.s[2]\n" + "ldr q28, [%x[params], #0x120]\n" + "fmla v12.4s, v27.4s, v7.s[0]\n" + "fmla v13.4s, v27.4s, v7.s[1]\n" + "fmla v14.4s, v27.4s, v7.s[2]\n" + "fmla v15.4s, v27.4s, v7.s[3]\n" + "fmla v16.4s, v27.4s, v9.s[0]\n" + "fmla v17.4s, v27.4s, v9.s[1]\n" + "fmla v18.4s, v27.4s, v9.s[2]\n" + "fmla v19.4s, v27.4s, v9.s[3]\n" + "ldr q27, [%x[params], #0x130]\n" + "add %x[params], %x[params], #0x140\n" + "fmla v12.4s, v31.4s, v8.s[0]\n" + "fmla v13.4s, v31.4s, v8.s[1]\n" + "fmla v14.4s, v31.4s, v8.s[2]\n" + "fmla v15.4s, v31.4s, v8.s[3]\n" + "fmla v16.4s, v31.4s, v10.s[0]\n" + "fmla v17.4s, v31.4s, v10.s[1]\n" + "fmla v18.4s, v31.4s, v10.s[2]\n" + "fmla v19.4s, v31.4s, v10.s[3]\n" + "fmla v12.4s, v30.4s, v8.s[1]\n" + "fmla v13.4s, v30.4s, v8.s[2]\n" + "fmla v14.4s, v30.4s, v8.s[3]\n" + "fmla v15.4s, v30.4s, v9.s[0]\n" + "fmla v16.4s, v30.4s, v10.s[1]\n" + "fmla v17.4s, v30.4s, v10.s[2]\n" + "fmla v18.4s, v30.4s, v10.s[3]\n" + "fmla v19.4s, v30.4s, v11.s[0]\n" + "fmla v12.4s, v29.4s, v8.s[2]\n" + "fmla v13.4s, v29.4s, v8.s[3]\n" + "fmla v14.4s, v29.4s, v9.s[0]\n" + "fmla v15.4s, v29.4s, v9.s[1]\n" + "fmla v16.4s, v29.4s, v10.s[2]\n" + "fmla v17.4s, v29.4s, v10.s[3]\n" + "fmla v18.4s, v29.4s, v11.s[0]\n" + "fmla v19.4s, v29.4s, v11.s[1]\n" + "fmla v12.4s, v28.4s, v8.s[3]\n" + "fmla v13.4s, v28.4s, v9.s[0]\n" + "fmla v14.4s, v28.4s, v9.s[1]\n" + "fmla v15.4s, v28.4s, v9.s[2]\n" + "fmla v16.4s, v28.4s, v10.s[3]\n" + "fmla v17.4s, v28.4s, v11.s[0]\n" + "fmla v18.4s, v28.4s, v11.s[1]\n" + "fmla v19.4s, v28.4s, v11.s[2]\n" + "fmla v12.4s, v27.4s, v9.s[0]\n" + "fmla v13.4s, v27.4s, v9.s[1]\n" + "fmla v14.4s, v27.4s, v9.s[2]\n" + "fmla v15.4s, v27.4s, v9.s[3]\n" + "fmla v16.4s, v27.4s, v11.s[0]\n" + "fmla v17.4s, v27.4s, v11.s[1]\n" + "fmla v18.4s, v27.4s, v11.s[2]\n" + "fmla v19.4s, v27.4s, v11.s[3]\n" + "fmin v12.4s, v12.4s, v20.4s\n" + "fmin v13.4s, v13.4s, v20.4s\n" + "fmin v14.4s, v14.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v21.4s\n" + "str q12, [x13, x25]\n" + "fmax v13.4s, v13.4s, v21.4s\n" + "fmax v14.4s, v14.4s, v21.4s\n" + "str q13, [x12, x25]\n" + "fmin v15.4s, v15.4s, v20.4s\n" + "fmin v16.4s, v16.4s, v20.4s\n" + "str q14, [x10, x25]\n" + "fmin v17.4s, v17.4s, v20.4s\n" + "fmin v18.4s, v18.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v21.4s\n" + "str q15, [x9, x25]\n" + "fmax v16.4s, v16.4s, v21.4s\n" + "fmax v17.4s, v17.4s, v21.4s\n" + "str q16, [x27, x25]\n" + "fmax v18.4s, v18.4s, v21.4s\n" + "fmin v19.4s, v19.4s, v20.4s\n" + "str q17, [x26, x25]\n" + "fmax v19.4s, v19.4s, v21.4s\n" + "str q18, [x24, x25]\n" + "str q19, [x23, x25]\n" + "add x25, x25, #0x10\n" + "3:" // Output channel oddments + "tst %x[channel_multiplier], #0x3\n" + "beq 6f\n" + "ldr q12, [%x[params], #0x0]\n" + "mov v13.16b, v12.16b\n" + "ldr q31, [%x[params], #0x10]\n" + "mov v14.16b, v12.16b\n" + "ldr q30, [%x[params], #0x20]\n" + "mov v15.16b, v12.16b\n" + "ldr q29, [%x[params], #0x30]\n" + "mov v16.16b, v12.16b\n" + "ldr q28, [%x[params], #0x40]\n" + "mov v17.16b, v12.16b\n" + "ldr q27, [%x[params], #0x50]\n" + "mov v18.16b, v12.16b\n" + "mov v19.16b, v12.16b\n" + "fmla v12.4s, v31.4s, v0.s[0]\n" + "fmla v13.4s, v31.4s, v0.s[1]\n" + "fmla v14.4s, v31.4s, v0.s[2]\n" + "fmla v15.4s, v31.4s, v0.s[3]\n" + "fmla v16.4s, v31.4s, v2.s[0]\n" + "fmla v17.4s, v31.4s, v2.s[1]\n" + "fmla v18.4s, v31.4s, v2.s[2]\n" + 
"fmla v19.4s, v31.4s, v2.s[3]\n" + "ldr q31, [%x[params], #0x60]\n" + "fmla v12.4s, v30.4s, v0.s[1]\n" + "fmla v13.4s, v30.4s, v0.s[2]\n" + "fmla v14.4s, v30.4s, v0.s[3]\n" + "fmla v15.4s, v30.4s, v1.s[0]\n" + "fmla v16.4s, v30.4s, v2.s[1]\n" + "fmla v17.4s, v30.4s, v2.s[2]\n" + "fmla v18.4s, v30.4s, v2.s[3]\n" + "fmla v19.4s, v30.4s, v3.s[0]\n" + "ldr q30, [%x[params], #0x70]\n" + "fmla v12.4s, v29.4s, v0.s[2]\n" + "fmla v13.4s, v29.4s, v0.s[3]\n" + "fmla v14.4s, v29.4s, v1.s[0]\n" + "fmla v15.4s, v29.4s, v1.s[1]\n" + "fmla v16.4s, v29.4s, v2.s[2]\n" + "fmla v17.4s, v29.4s, v2.s[3]\n" + "fmla v18.4s, v29.4s, v3.s[0]\n" + "fmla v19.4s, v29.4s, v3.s[1]\n" + "ldr q29, [%x[params], #0x80]\n" + "fmla v12.4s, v28.4s, v0.s[3]\n" + "fmla v13.4s, v28.4s, v1.s[0]\n" + "fmla v14.4s, v28.4s, v1.s[1]\n" + "fmla v15.4s, v28.4s, v1.s[2]\n" + "fmla v16.4s, v28.4s, v2.s[3]\n" + "fmla v17.4s, v28.4s, v3.s[0]\n" + "fmla v18.4s, v28.4s, v3.s[1]\n" + "fmla v19.4s, v28.4s, v3.s[2]\n" + "ldr q28, [%x[params], #0x90]\n" + "fmla v12.4s, v27.4s, v1.s[0]\n" + "fmla v13.4s, v27.4s, v1.s[1]\n" + "fmla v14.4s, v27.4s, v1.s[2]\n" + "fmla v15.4s, v27.4s, v1.s[3]\n" + "fmla v16.4s, v27.4s, v3.s[0]\n" + "fmla v17.4s, v27.4s, v3.s[1]\n" + "fmla v18.4s, v27.4s, v3.s[2]\n" + "fmla v19.4s, v27.4s, v3.s[3]\n" + "ldr q27, [%x[params], #0xa0]\n" + "fmla v12.4s, v31.4s, v2.s[0]\n" + "fmla v13.4s, v31.4s, v2.s[1]\n" + "fmla v14.4s, v31.4s, v2.s[2]\n" + "fmla v15.4s, v31.4s, v2.s[3]\n" + "fmla v16.4s, v31.4s, v4.s[0]\n" + "fmla v17.4s, v31.4s, v4.s[1]\n" + "fmla v18.4s, v31.4s, v4.s[2]\n" + "fmla v19.4s, v31.4s, v4.s[3]\n" + "ldr q31, [%x[params], #0xb0]\n" + "fmla v12.4s, v30.4s, v2.s[1]\n" + "fmla v13.4s, v30.4s, v2.s[2]\n" + "fmla v14.4s, v30.4s, v2.s[3]\n" + "fmla v15.4s, v30.4s, v3.s[0]\n" + "fmla v16.4s, v30.4s, v4.s[1]\n" + "fmla v17.4s, v30.4s, v4.s[2]\n" + "fmla v18.4s, v30.4s, v4.s[3]\n" + "fmla v19.4s, v30.4s, v5.s[0]\n" + "ldr q30, [%x[params], #0xc0]\n" + "fmla v12.4s, v29.4s, v2.s[2]\n" + "fmla v13.4s, v29.4s, v2.s[3]\n" + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v15.4s, v29.4s, v3.s[1]\n" + "fmla v16.4s, v29.4s, v4.s[2]\n" + "fmla v17.4s, v29.4s, v4.s[3]\n" + "fmla v18.4s, v29.4s, v5.s[0]\n" + "fmla v19.4s, v29.4s, v5.s[1]\n" + "ldr q29, [%x[params], #0xd0]\n" + "fmla v12.4s, v28.4s, v2.s[3]\n" + "fmla v13.4s, v28.4s, v3.s[0]\n" + "fmla v14.4s, v28.4s, v3.s[1]\n" + "fmla v15.4s, v28.4s, v3.s[2]\n" + "fmla v16.4s, v28.4s, v4.s[3]\n" + "fmla v17.4s, v28.4s, v5.s[0]\n" + "fmla v18.4s, v28.4s, v5.s[1]\n" + "fmla v19.4s, v28.4s, v5.s[2]\n" + "ldr q28, [%x[params], #0xe0]\n" + "fmla v12.4s, v27.4s, v3.s[0]\n" + "fmla v13.4s, v27.4s, v3.s[1]\n" + "fmla v14.4s, v27.4s, v3.s[2]\n" + "fmla v15.4s, v27.4s, v3.s[3]\n" + "fmla v16.4s, v27.4s, v5.s[0]\n" + "fmla v17.4s, v27.4s, v5.s[1]\n" + "fmla v18.4s, v27.4s, v5.s[2]\n" + "fmla v19.4s, v27.4s, v5.s[3]\n" + "ldr q27, [%x[params], #0xf0]\n" + "fmla v12.4s, v31.4s, v4.s[0]\n" + "fmla v13.4s, v31.4s, v4.s[1]\n" + "fmla v14.4s, v31.4s, v4.s[2]\n" + "fmla v15.4s, v31.4s, v4.s[3]\n" + "fmla v16.4s, v31.4s, v6.s[0]\n" + "fmla v17.4s, v31.4s, v6.s[1]\n" + "fmla v18.4s, v31.4s, v6.s[2]\n" + "fmla v19.4s, v31.4s, v6.s[3]\n" + "ldr q31, [%x[params], #0x100]\n" + "fmla v12.4s, v30.4s, v4.s[1]\n" + "fmla v13.4s, v30.4s, v4.s[2]\n" + "fmla v14.4s, v30.4s, v4.s[3]\n" + "fmla v15.4s, v30.4s, v5.s[0]\n" + "fmla v16.4s, v30.4s, v6.s[1]\n" + "fmla v17.4s, v30.4s, v6.s[2]\n" + "fmla v18.4s, v30.4s, v6.s[3]\n" + "fmla v19.4s, v30.4s, v7.s[0]\n" + "ldr q30, [%x[params], #0x110]\n" + "fmla v12.4s, 
v29.4s, v4.s[2]\n" + "fmla v13.4s, v29.4s, v4.s[3]\n" + "fmla v14.4s, v29.4s, v5.s[0]\n" + "fmla v15.4s, v29.4s, v5.s[1]\n" + "fmla v16.4s, v29.4s, v6.s[2]\n" + "fmla v17.4s, v29.4s, v6.s[3]\n" + "fmla v18.4s, v29.4s, v7.s[0]\n" + "fmla v19.4s, v29.4s, v7.s[1]\n" + "ldr q29, [%x[params], #0x120]\n" + "fmla v12.4s, v28.4s, v4.s[3]\n" + "fmla v13.4s, v28.4s, v5.s[0]\n" + "fmla v14.4s, v28.4s, v5.s[1]\n" + "fmla v15.4s, v28.4s, v5.s[2]\n" + "fmla v16.4s, v28.4s, v6.s[3]\n" + "fmla v17.4s, v28.4s, v7.s[0]\n" + "fmla v18.4s, v28.4s, v7.s[1]\n" + "fmla v19.4s, v28.4s, v7.s[2]\n" + "ldr q28, [%x[params], #0x130]\n" + "fmla v12.4s, v27.4s, v5.s[0]\n" + "fmla v13.4s, v27.4s, v5.s[1]\n" + "fmla v14.4s, v27.4s, v5.s[2]\n" + "fmla v15.4s, v27.4s, v5.s[3]\n" + "fmla v16.4s, v27.4s, v7.s[0]\n" + "fmla v17.4s, v27.4s, v7.s[1]\n" + "fmla v18.4s, v27.4s, v7.s[2]\n" + "fmla v19.4s, v27.4s, v7.s[3]\n" + "ldr q27, [%x[params], #0x140]\n" + "fmla v12.4s, v31.4s, v6.s[0]\n" + "fmla v13.4s, v31.4s, v6.s[1]\n" + "fmla v14.4s, v31.4s, v6.s[2]\n" + "fmla v15.4s, v31.4s, v6.s[3]\n" + "fmla v16.4s, v31.4s, v8.s[0]\n" + "fmla v17.4s, v31.4s, v8.s[1]\n" + "fmla v18.4s, v31.4s, v8.s[2]\n" + "fmla v19.4s, v31.4s, v8.s[3]\n" + "ldr q31, [%x[params], #0x150]\n" + "fmla v12.4s, v30.4s, v6.s[1]\n" + "fmla v13.4s, v30.4s, v6.s[2]\n" + "fmla v14.4s, v30.4s, v6.s[3]\n" + "fmla v15.4s, v30.4s, v7.s[0]\n" + "fmla v16.4s, v30.4s, v8.s[1]\n" + "fmla v17.4s, v30.4s, v8.s[2]\n" + "fmla v18.4s, v30.4s, v8.s[3]\n" + "fmla v19.4s, v30.4s, v9.s[0]\n" + "ldr q30, [%x[params], #0x160]\n" + "fmla v12.4s, v29.4s, v6.s[2]\n" + "fmla v13.4s, v29.4s, v6.s[3]\n" + "fmla v14.4s, v29.4s, v7.s[0]\n" + "fmla v15.4s, v29.4s, v7.s[1]\n" + "fmla v16.4s, v29.4s, v8.s[2]\n" + "fmla v17.4s, v29.4s, v8.s[3]\n" + "fmla v18.4s, v29.4s, v9.s[0]\n" + "fmla v19.4s, v29.4s, v9.s[1]\n" + "ldr q29, [%x[params], #0x170]\n" + "fmla v12.4s, v28.4s, v6.s[3]\n" + "fmla v13.4s, v28.4s, v7.s[0]\n" + "fmla v14.4s, v28.4s, v7.s[1]\n" + "fmla v15.4s, v28.4s, v7.s[2]\n" + "fmla v16.4s, v28.4s, v8.s[3]\n" + "fmla v17.4s, v28.4s, v9.s[0]\n" + "fmla v18.4s, v28.4s, v9.s[1]\n" + "fmla v19.4s, v28.4s, v9.s[2]\n" + "ldr q28, [%x[params], #0x180]\n" + "fmla v12.4s, v27.4s, v7.s[0]\n" + "fmla v13.4s, v27.4s, v7.s[1]\n" + "fmla v14.4s, v27.4s, v7.s[2]\n" + "fmla v15.4s, v27.4s, v7.s[3]\n" + "fmla v16.4s, v27.4s, v9.s[0]\n" + "fmla v17.4s, v27.4s, v9.s[1]\n" + "fmla v18.4s, v27.4s, v9.s[2]\n" + "fmla v19.4s, v27.4s, v9.s[3]\n" + "ldr q27, [%x[params], #0x190]\n" + "add %x[params], %x[params], #0x1a0\n" + "fmla v12.4s, v31.4s, v8.s[0]\n" + "fmla v13.4s, v31.4s, v8.s[1]\n" + "fmla v14.4s, v31.4s, v8.s[2]\n" + "fmla v15.4s, v31.4s, v8.s[3]\n" + "fmla v16.4s, v31.4s, v10.s[0]\n" + "fmla v17.4s, v31.4s, v10.s[1]\n" + "fmla v18.4s, v31.4s, v10.s[2]\n" + "fmla v19.4s, v31.4s, v10.s[3]\n" + "fmla v12.4s, v30.4s, v8.s[1]\n" + "fmla v13.4s, v30.4s, v8.s[2]\n" + "fmla v14.4s, v30.4s, v8.s[3]\n" + "fmla v15.4s, v30.4s, v9.s[0]\n" + "fmla v16.4s, v30.4s, v10.s[1]\n" + "fmla v17.4s, v30.4s, v10.s[2]\n" + "fmla v18.4s, v30.4s, v10.s[3]\n" + "fmla v19.4s, v30.4s, v11.s[0]\n" + "fmla v12.4s, v29.4s, v8.s[2]\n" + "fmla v13.4s, v29.4s, v8.s[3]\n" + "fmla v14.4s, v29.4s, v9.s[0]\n" + "fmla v15.4s, v29.4s, v9.s[1]\n" + "fmla v16.4s, v29.4s, v10.s[2]\n" + "fmla v17.4s, v29.4s, v10.s[3]\n" + "fmla v18.4s, v29.4s, v11.s[0]\n" + "fmla v19.4s, v29.4s, v11.s[1]\n" + "fmla v12.4s, v28.4s, v8.s[3]\n" + "fmla v13.4s, v28.4s, v9.s[0]\n" + "fmla v14.4s, v28.4s, v9.s[1]\n" + "fmla v15.4s, v28.4s, v9.s[2]\n" + "fmla 
v16.4s, v28.4s, v10.s[3]\n" + "fmla v17.4s, v28.4s, v11.s[0]\n" + "fmla v18.4s, v28.4s, v11.s[1]\n" + "fmla v19.4s, v28.4s, v11.s[2]\n" + "fmla v12.4s, v27.4s, v9.s[0]\n" + "fmla v13.4s, v27.4s, v9.s[1]\n" + "fmla v14.4s, v27.4s, v9.s[2]\n" + "fmla v15.4s, v27.4s, v9.s[3]\n" + "fmla v16.4s, v27.4s, v11.s[0]\n" + "fmla v17.4s, v27.4s, v11.s[1]\n" + "fmla v18.4s, v27.4s, v11.s[2]\n" + "fmla v19.4s, v27.4s, v11.s[3]\n" + "fmin v12.4s, v12.4s, v20.4s\n" + "fmin v13.4s, v13.4s, v20.4s\n" + "fmin v14.4s, v14.4s, v20.4s\n" + "fmax v12.4s, v12.4s, v21.4s\n" + "fmax v13.4s, v13.4s, v21.4s\n" + "fmax v14.4s, v14.4s, v21.4s\n" + "fmin v15.4s, v15.4s, v20.4s\n" + "fmin v16.4s, v16.4s, v20.4s\n" + "fmin v17.4s, v17.4s, v20.4s\n" + "fmax v15.4s, v15.4s, v21.4s\n" + "fmax v16.4s, v16.4s, v21.4s\n" + "fmax v17.4s, v17.4s, v21.4s\n" + "fmin v18.4s, v18.4s, v20.4s\n" + "fmin v19.4s, v19.4s, v20.4s\n" + "fmax v18.4s, v18.4s, v21.4s\n" + "fmax v19.4s, v19.4s, v21.4s\n" + "tbz %x[channel_multiplier], #1, 4f\n" + "add x19, x13, x25\n" + "st1 { v12.d }[0], [x19]\n" + "add x19, x12, x25\n" + "st1 { v13.d }[0], [x19]\n" + "add x19, x10, x25\n" + "st1 { v14.d }[0], [x19]\n" + "add x19, x9, x25\n" + "st1 { v15.d }[0], [x19]\n" + "add x19, x27, x25\n" + "st1 { v16.d }[0], [x19]\n" + "add x19, x26, x25\n" + "st1 { v17.d }[0], [x19]\n" + "add x19, x24, x25\n" + "st1 { v18.d }[0], [x19]\n" + "add x19, x23, x25\n" + "st1 { v19.d }[0], [x19]\n" + "add x25, x25, #0x8\n" + "tbz %x[channel_multiplier], #0, 5f\n" + "add x19, x13, x25\n" + "st1 { v12.s }[2], [x19]\n" + "add x19, x12, x25\n" + "st1 { v13.s }[2], [x19]\n" + "add x19, x10, x25\n" + "st1 { v14.s }[2], [x19]\n" + "add x19, x9, x25\n" + "st1 { v15.s }[2], [x19]\n" + "add x19, x27, x25\n" + "st1 { v16.s }[2], [x19]\n" + "add x19, x26, x25\n" + "st1 { v17.s }[2], [x19]\n" + "add x19, x24, x25\n" + "st1 { v18.s }[2], [x19]\n" + "add x19, x23, x25\n" + "st1 { v19.s }[2], [x19]\n" + "b 5f\n" + "4:" // Output channel oddments: Store: Bit 1: Unset + "tbz %x[channel_multiplier], #0, 5f\n" + "add x19, x13, x25\n" + "st1 { v12.s }[0], [x19]\n" + "add x19, x12, x25\n" + "st1 { v13.s }[0], [x19]\n" + "add x19, x10, x25\n" + "st1 { v14.s }[0], [x19]\n" + "add x19, x9, x25\n" + "st1 { v15.s }[0], [x19]\n" + "add x19, x27, x25\n" + "st1 { v16.s }[0], [x19]\n" + "add x19, x26, x25\n" + "st1 { v17.s }[0], [x19]\n" + "add x19, x24, x25\n" + "st1 { v18.s }[0], [x19]\n" + "add x19, x23, x25\n" + "st1 { v19.s }[0], [x19]\n" + "5:" // Output channel oddments: Store: Bit 1: End + + "6:" // End + + : [params] "+&r" (params) + : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp new file mode 100644 index 0000000000..2cc2f7c103 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -0,0 +1,58 @@ +/* + * 
Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int output_rows(void) { return 2; };
+  constexpr static unsigned int output_cols(void) { return 8; };
+
+  constexpr static unsigned int output_col_regs(void) { return 2; };
+
+  kern_type kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+
+  a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c93037d183
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const float *weights, + const float *bias, + const unsigned int kernel_points, + const unsigned int n_output_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ld1r { v11.4s }, [%x[minmax_vals]]\n" + "mov x10, #0x0\n" + "add x19, %x[minmax_vals], #0x4\n" + "ld1r { v10.4s }, [x19]\n" + "lsr x9, %x[n_output_channels], #0x2\n" + "cbz x9, 8f\n" + "1:" // Output channel loop + "movi v16.16b, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x10, #0x2\n" + "ldr q16, [%x[bias], x19]\n" + "2:" // Output channel loop: Load bias: Done + "mov v9.16b, v16.16b\n" + "ldr q8, [%x[weights], #0x0]\n" + "mov x19, %x[inptrs]\n" + "mov v7.16b, v16.16b\n" + "ldp x24, x28, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "mov v6.16b, v16.16b\n" + "ldr q5, [x24, #0x0]\n" + "mov v4.16b, v16.16b\n" + "add %x[weights], %x[weights], #0x10\n" + "mov v3.16b, v16.16b\n" + "ldr q2, [x24, #0x10]\n" + "mov v1.16b, v16.16b\n" + "ldr q0, [x28, #0x0]\n" + "mov v31.16b, v16.16b\n" + "ldr q30, [x28, #0x10]\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "cbz x20, 6f\n" + "ldp x24, x28, [x19], #0x10\n" + "ldr q20, [%x[weights], #0x0]\n" + "subs x20, x20, #0x1\n" + "add %x[weights], %x[weights], #0x10\n" + "ldr q19, [x24, #0x0]\n" + "ldr q18, [x24, #0x10]\n" + "ldr q17, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "beq 4f\n" + "3:" // Output channel loop: Kernel loop + "fmla v9.4s, v8.4s, v5.s[0]\n" + "ldp x24, x28, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "ldr q5, [x24, #0x0]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "ldr q2, [x24, #0x10]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "ldr 
q0, [x28, #0x0]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "ldr q30, [x28, #0x10]\n" + "fmla v9.4s, v20.4s, v19.s[0]\n" + "ldr q8, [%x[weights], #0x0]\n" + "fmla v7.4s, v20.4s, v19.s[1]\n" + "ldp x24, x28, [x19], #0x10\n" + "fmla v6.4s, v20.4s, v19.s[2]\n" + "fmla v4.4s, v20.4s, v19.s[3]\n" + "ldr q19, [x24, #0x0]\n" + "fmla v3.4s, v20.4s, v18.s[0]\n" + "fmla v1.4s, v20.4s, v18.s[1]\n" + "fmla v31.4s, v20.4s, v18.s[2]\n" + "fmla v29.4s, v20.4s, v18.s[3]\n" + "ldr q18, [x24, #0x10]\n" + "fmla v28.4s, v20.4s, v17.s[0]\n" + "fmla v27.4s, v20.4s, v17.s[1]\n" + "fmla v26.4s, v20.4s, v17.s[2]\n" + "fmla v25.4s, v20.4s, v17.s[3]\n" + "ldr q17, [x28, #0x0]\n" + "fmla v24.4s, v20.4s, v16.s[0]\n" + "fmla v23.4s, v20.4s, v16.s[1]\n" + "fmla v22.4s, v20.4s, v16.s[2]\n" + "fmla v21.4s, v20.4s, v16.s[3]\n" + "ldr q16, [x28, #0x10]\n" + "ldr q20, [%x[weights], #0x10]\n" + "add %x[weights], %x[weights], #0x20\n" + "bgt 3b\n" + "4:" // Output channel loop: Kernel loop tail + "tbnz %x[kernel_points], #0, 5f\n" + "fmla v9.4s, v8.4s, v5.s[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "lsl x27, x10, #0x2\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "fmla v9.4s, v20.4s, v19.s[0]\n" + "fmla v7.4s, v20.4s, v19.s[1]\n" + "fmla v6.4s, v20.4s, v19.s[2]\n" + "fmla v4.4s, v20.4s, v19.s[3]\n" + "fmla v3.4s, v20.4s, v18.s[0]\n" + "fmla v1.4s, v20.4s, v18.s[1]\n" + "fmla v31.4s, v20.4s, v18.s[2]\n" + "fmla v29.4s, v20.4s, v18.s[3]\n" + "fmla v28.4s, v20.4s, v17.s[0]\n" + "fmla v27.4s, v20.4s, v17.s[1]\n" + "fmla v26.4s, v20.4s, v17.s[2]\n" + "fmla v25.4s, v20.4s, v17.s[3]\n" + "fmla v24.4s, v20.4s, v16.s[0]\n" + "fmla v23.4s, v20.4s, v16.s[1]\n" + "fmla v22.4s, v20.4s, v16.s[2]\n" + "fmla v21.4s, v20.4s, v16.s[3]\n" + "fmin v9.4s, v9.4s, v10.4s\n" + "fmin v7.4s, v7.4s, v10.4s\n" + "fmin v6.4s, v6.4s, v10.4s\n" + "fmax v9.4s, v9.4s, v11.4s\n" + "str q9, [x19, x27]\n" + "fmax v7.4s, v7.4s, v11.4s\n" + "fmax v6.4s, v6.4s, v11.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmin v4.4s, v4.4s, v10.4s\n" + "str q7, [x20, x27]\n" + "fmin v3.4s, v3.4s, v10.4s\n" + "fmin v1.4s, v1.4s, v10.4s\n" + "str q6, [x21, x27]\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin v31.4s, v31.4s, v10.4s\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "str q4, [x22, x27]\n" + "fmax v1.4s, v1.4s, v11.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax v31.4s, v31.4s, v11.4s\n" + "str q3, [x23, x27]\n" + "fmin v29.4s, v29.4s, v10.4s\n" + "str q1, [x24, x27]\n" + "fmin v28.4s, v28.4s, v10.4s\n" + "str q31, [x25, x27]\n" + "fmin v27.4s, v27.4s, v10.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmax v29.4s, v29.4s, v11.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmax v28.4s, v28.4s, v11.4s\n" + "ldr 
x25, [%x[outptrs], #0x70]\n" + "fmax v27.4s, v27.4s, v11.4s\n" + "str q29, [x26, x27]\n" + "fmin v26.4s, v26.4s, v10.4s\n" + "str q28, [x19, x27]\n" + "fmin v25.4s, v25.4s, v10.4s\n" + "str q27, [x20, x27]\n" + "fmin v24.4s, v24.4s, v10.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax v26.4s, v26.4s, v11.4s\n" + "str q26, [x21, x27]\n" + "fmax v25.4s, v25.4s, v11.4s\n" + "fmax v24.4s, v24.4s, v11.4s\n" + "str q25, [x22, x27]\n" + "fmin v23.4s, v23.4s, v10.4s\n" + "fmin v22.4s, v22.4s, v10.4s\n" + "str q24, [x23, x27]\n" + "fmin v21.4s, v21.4s, v10.4s\n" + "fmax v23.4s, v23.4s, v11.4s\n" + "str q23, [x24, x27]\n" + "fmax v22.4s, v22.4s, v11.4s\n" + "fmax v21.4s, v21.4s, v11.4s\n" + "str q22, [x25, x27]\n" + "str q21, [x26, x27]\n" + "b 7f\n" + "5:" // Output channel loop: Odd tail + "fmla v9.4s, v8.4s, v5.s[0]\n" + "ldp x24, x28, [x19], #0x10\n" + "lsl x27, x10, #0x2\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "ldr q5, [x24, #0x0]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "ldr q2, [x24, #0x10]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "ldr q0, [x28, #0x0]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "ldr q30, [x28, #0x10]\n" + "fmla v9.4s, v20.4s, v19.s[0]\n" + "ldr q8, [%x[weights], #0x0]\n" + "add %x[weights], %x[weights], #0x10\n" + "fmla v7.4s, v20.4s, v19.s[1]\n" + "fmla v6.4s, v20.4s, v19.s[2]\n" + "fmla v4.4s, v20.4s, v19.s[3]\n" + "fmla v3.4s, v20.4s, v18.s[0]\n" + "fmla v1.4s, v20.4s, v18.s[1]\n" + "fmla v31.4s, v20.4s, v18.s[2]\n" + "fmla v29.4s, v20.4s, v18.s[3]\n" + "fmla v28.4s, v20.4s, v17.s[0]\n" + "fmla v27.4s, v20.4s, v17.s[1]\n" + "fmla v26.4s, v20.4s, v17.s[2]\n" + "fmla v25.4s, v20.4s, v17.s[3]\n" + "fmla v24.4s, v20.4s, v16.s[0]\n" + "fmla v23.4s, v20.4s, v16.s[1]\n" + "fmla v22.4s, v20.4s, v16.s[2]\n" + "fmla v21.4s, v20.4s, v16.s[3]\n" + "fmla v9.4s, v8.4s, v5.s[0]\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "fmin v9.4s, v9.4s, v10.4s\n" + "fmin v7.4s, v7.4s, v10.4s\n" + "fmin v6.4s, v6.4s, v10.4s\n" + "fmax v9.4s, v9.4s, v11.4s\n" + "str q9, [x19, x27]\n" + "fmax v7.4s, v7.4s, v11.4s\n" + "fmax v6.4s, v6.4s, v11.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmin v4.4s, v4.4s, v10.4s\n" + "str q7, [x20, x27]\n" + "fmin v3.4s, v3.4s, v10.4s\n" + "fmin v1.4s, v1.4s, v10.4s\n" + "str q6, [x21, x27]\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin v31.4s, v31.4s, v10.4s\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "str q4, 
[x22, x27]\n" + "fmax v1.4s, v1.4s, v11.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax v31.4s, v31.4s, v11.4s\n" + "str q3, [x23, x27]\n" + "fmin v29.4s, v29.4s, v10.4s\n" + "str q1, [x24, x27]\n" + "fmin v28.4s, v28.4s, v10.4s\n" + "str q31, [x25, x27]\n" + "fmin v27.4s, v27.4s, v10.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmax v29.4s, v29.4s, v11.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmax v28.4s, v28.4s, v11.4s\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax v27.4s, v27.4s, v11.4s\n" + "str q29, [x26, x27]\n" + "fmin v26.4s, v26.4s, v10.4s\n" + "str q28, [x19, x27]\n" + "fmin v25.4s, v25.4s, v10.4s\n" + "str q27, [x20, x27]\n" + "fmin v24.4s, v24.4s, v10.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax v26.4s, v26.4s, v11.4s\n" + "str q26, [x21, x27]\n" + "fmax v25.4s, v25.4s, v11.4s\n" + "fmax v24.4s, v24.4s, v11.4s\n" + "str q25, [x22, x27]\n" + "fmin v23.4s, v23.4s, v10.4s\n" + "fmin v22.4s, v22.4s, v10.4s\n" + "str q24, [x23, x27]\n" + "fmin v21.4s, v21.4s, v10.4s\n" + "fmax v23.4s, v23.4s, v11.4s\n" + "str q23, [x24, x27]\n" + "fmax v22.4s, v22.4s, v11.4s\n" + "fmax v21.4s, v21.4s, v11.4s\n" + "str q22, [x25, x27]\n" + "str q21, [x26, x27]\n" + "b 7f\n" + "6:" // Output channel loop: Single kernel point + "fmla v9.4s, v8.4s, v5.s[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "lsl x27, x10, #0x2\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "fmin v9.4s, v9.4s, v10.4s\n" + "fmin v7.4s, v7.4s, v10.4s\n" + "fmin v6.4s, v6.4s, v10.4s\n" + "fmax v9.4s, v9.4s, v11.4s\n" + "str q9, [x19, x27]\n" + "fmax v7.4s, v7.4s, v11.4s\n" + "fmax v6.4s, v6.4s, v11.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmin v4.4s, v4.4s, v10.4s\n" + "str q7, [x20, x27]\n" + "fmin v3.4s, v3.4s, v10.4s\n" + "fmin v1.4s, v1.4s, v10.4s\n" + "str q6, [x21, x27]\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin v31.4s, v31.4s, v10.4s\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "str q4, [x22, x27]\n" + "fmax v1.4s, v1.4s, v11.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax v31.4s, v31.4s, v11.4s\n" + "str q3, [x23, x27]\n" + "fmin v29.4s, v29.4s, v10.4s\n" + "str q1, [x24, x27]\n" + "fmin v28.4s, v28.4s, v10.4s\n" + "str q31, [x25, x27]\n" + "fmin v27.4s, v27.4s, v10.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmax v29.4s, v29.4s, v11.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmax v28.4s, v28.4s, v11.4s\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax v27.4s, v27.4s, v11.4s\n" + "str q29, [x26, x27]\n" + "fmin v26.4s, v26.4s, v10.4s\n" + "str q28, [x19, x27]\n" + "fmin v25.4s, v25.4s, v10.4s\n" + "str q27, [x20, x27]\n" + "fmin v24.4s, v24.4s, v10.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax v26.4s, v26.4s, v11.4s\n" + "str q26, [x21, x27]\n" + "fmax v25.4s, v25.4s, v11.4s\n" + "fmax v24.4s, v24.4s, v11.4s\n" + "str q25, [x22, x27]\n" + 
"fmin v23.4s, v23.4s, v10.4s\n" + "fmin v22.4s, v22.4s, v10.4s\n" + "str q24, [x23, x27]\n" + "fmin v21.4s, v21.4s, v10.4s\n" + "fmax v23.4s, v23.4s, v11.4s\n" + "str q23, [x24, x27]\n" + "fmax v22.4s, v22.4s, v11.4s\n" + "fmax v21.4s, v21.4s, v11.4s\n" + "str q22, [x25, x27]\n" + "str q21, [x26, x27]\n" + "7:" // Output channel loop: Done + "add x10, x10, #0x4\n" + "cmp x10, x9, LSL #2\n" + "blt 1b\n" + "tst %x[n_output_channels], #0x3\n" + "beq 19f\n" + "8:" // Output channel oddments + "movi v16.16b, #0x0\n" + "cbz %x[bias], 11f\n" + "add x19, %x[bias], x10, LSL #2\n" + "tbz %x[n_output_channels], #1, 9f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #0, 10f\n" + "ld1 { v16.s }[2], [x19]\n" + "b 10f\n" + "9:" // Output channel oddments: Load bias: Bit 1: Unset + "tbz %x[n_output_channels], #0, 10f\n" + "ld1 { v16.s }[0], [x19]\n" + "10:" // Output channel oddments: Load bias: Bit 1: End + + "11:" // Output channel oddments: Load bias: Done + "mov v9.16b, v16.16b\n" + "ldr q8, [%x[weights], #0x0]\n" + "mov x19, %x[inptrs]\n" + "mov v7.16b, v16.16b\n" + "ldp x24, x28, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "mov v6.16b, v16.16b\n" + "ldr q5, [x24, #0x0]\n" + "mov v4.16b, v16.16b\n" + "add %x[weights], %x[weights], #0x10\n" + "mov v3.16b, v16.16b\n" + "ldr q2, [x24, #0x10]\n" + "mov v1.16b, v16.16b\n" + "ldr q0, [x28, #0x0]\n" + "mov v31.16b, v16.16b\n" + "ldr q30, [x28, #0x10]\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "cbz x20, 15f\n" + "ldp x24, x28, [x19], #0x10\n" + "ldr q20, [%x[weights], #0x0]\n" + "subs x20, x20, #0x1\n" + "add %x[weights], %x[weights], #0x10\n" + "ldr q19, [x24, #0x0]\n" + "ldr q18, [x24, #0x10]\n" + "ldr q17, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "beq 13f\n" + "12:" // Output channel oddments: Kernel loop + "fmla v9.4s, v8.4s, v5.s[0]\n" + "ldp x24, x28, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "ldr q5, [x24, #0x0]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "ldr q2, [x24, #0x10]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "ldr q0, [x28, #0x0]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "ldr q30, [x28, #0x10]\n" + "fmla v9.4s, v20.4s, v19.s[0]\n" + "ldr q8, [%x[weights], #0x0]\n" + "fmla v7.4s, v20.4s, v19.s[1]\n" + "ldp x24, x28, [x19], #0x10\n" + "fmla v6.4s, v20.4s, v19.s[2]\n" + "fmla v4.4s, v20.4s, v19.s[3]\n" + "ldr q19, [x24, #0x0]\n" + "fmla v3.4s, v20.4s, v18.s[0]\n" + "fmla v1.4s, v20.4s, v18.s[1]\n" + "fmla v31.4s, v20.4s, v18.s[2]\n" + "fmla v29.4s, v20.4s, v18.s[3]\n" + "ldr q18, [x24, #0x10]\n" + "fmla v28.4s, v20.4s, v17.s[0]\n" + "fmla v27.4s, v20.4s, v17.s[1]\n" + "fmla v26.4s, v20.4s, v17.s[2]\n" + "fmla v25.4s, v20.4s, v17.s[3]\n" + "ldr q17, [x28, #0x0]\n" + "fmla v24.4s, v20.4s, v16.s[0]\n" + "fmla v23.4s, v20.4s, v16.s[1]\n" + "fmla v22.4s, v20.4s, v16.s[2]\n" + "fmla v21.4s, v20.4s, v16.s[3]\n" + "ldr q16, [x28, #0x10]\n" + "ldr q20, [%x[weights], #0x10]\n" + "add %x[weights], %x[weights], #0x20\n" + "bgt 
12b\n" + "13:" // Output channel oddments: Kernel loop tail + "tbnz %x[kernel_points], #0, 14f\n" + "fmla v9.4s, v8.4s, v5.s[0]\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "fmla v9.4s, v20.4s, v19.s[0]\n" + "fmla v7.4s, v20.4s, v19.s[1]\n" + "fmla v6.4s, v20.4s, v19.s[2]\n" + "fmla v4.4s, v20.4s, v19.s[3]\n" + "fmla v3.4s, v20.4s, v18.s[0]\n" + "fmla v1.4s, v20.4s, v18.s[1]\n" + "fmla v31.4s, v20.4s, v18.s[2]\n" + "fmla v29.4s, v20.4s, v18.s[3]\n" + "fmla v28.4s, v20.4s, v17.s[0]\n" + "fmla v27.4s, v20.4s, v17.s[1]\n" + "fmla v26.4s, v20.4s, v17.s[2]\n" + "fmla v25.4s, v20.4s, v17.s[3]\n" + "fmla v24.4s, v20.4s, v16.s[0]\n" + "fmla v23.4s, v20.4s, v16.s[1]\n" + "fmla v22.4s, v20.4s, v16.s[2]\n" + "fmla v21.4s, v20.4s, v16.s[3]\n" + "b 16f\n" + "14:" // Output channel oddments: Odd tail + "fmla v9.4s, v8.4s, v5.s[0]\n" + "ldp x24, x28, [x19], #0x10\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "ldr q5, [x24, #0x0]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "ldr q2, [x24, #0x10]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "ldr q0, [x28, #0x0]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "ldr q30, [x28, #0x10]\n" + "fmla v9.4s, v20.4s, v19.s[0]\n" + "ldr q8, [%x[weights], #0x0]\n" + "add %x[weights], %x[weights], #0x10\n" + "fmla v7.4s, v20.4s, v19.s[1]\n" + "fmla v6.4s, v20.4s, v19.s[2]\n" + "fmla v4.4s, v20.4s, v19.s[3]\n" + "fmla v3.4s, v20.4s, v18.s[0]\n" + "fmla v1.4s, v20.4s, v18.s[1]\n" + "fmla v31.4s, v20.4s, v18.s[2]\n" + "fmla v29.4s, v20.4s, v18.s[3]\n" + "fmla v28.4s, v20.4s, v17.s[0]\n" + "fmla v27.4s, v20.4s, v17.s[1]\n" + "fmla v26.4s, v20.4s, v17.s[2]\n" + "fmla v25.4s, v20.4s, v17.s[3]\n" + "fmla v24.4s, v20.4s, v16.s[0]\n" + "fmla v23.4s, v20.4s, v16.s[1]\n" + "fmla v22.4s, v20.4s, v16.s[2]\n" + "fmla v21.4s, v20.4s, v16.s[3]\n" + "fmla v9.4s, v8.4s, v5.s[0]\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "b 16f\n" + "15:" // Output channel oddments: Single kernel point + "fmla v9.4s, v8.4s, v5.s[0]\n" + "fmla v7.4s, v8.4s, v5.s[1]\n" + "fmla v6.4s, v8.4s, v5.s[2]\n" + "fmla v4.4s, v8.4s, v5.s[3]\n" + "fmla v3.4s, v8.4s, v2.s[0]\n" + "fmla v1.4s, v8.4s, v2.s[1]\n" + "fmla v31.4s, v8.4s, v2.s[2]\n" + "fmla v29.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v0.s[0]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "fmla v26.4s, 
v8.4s, v0.s[2]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "fmla v24.4s, v8.4s, v30.s[0]\n" + "fmla v23.4s, v8.4s, v30.s[1]\n" + "fmla v22.4s, v8.4s, v30.s[2]\n" + "fmla v21.4s, v8.4s, v30.s[3]\n" + "16:" // Output channel oddments: Done + "fmin v9.4s, v9.4s, v10.4s\n" + "fmin v7.4s, v7.4s, v10.4s\n" + "fmin v6.4s, v6.4s, v10.4s\n" + "fmin v4.4s, v4.4s, v10.4s\n" + "fmax v9.4s, v9.4s, v11.4s\n" + "fmax v7.4s, v7.4s, v11.4s\n" + "fmax v6.4s, v6.4s, v11.4s\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "fmin v3.4s, v3.4s, v10.4s\n" + "fmin v1.4s, v1.4s, v10.4s\n" + "fmin v31.4s, v31.4s, v10.4s\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "fmax v1.4s, v1.4s, v11.4s\n" + "fmax v31.4s, v31.4s, v11.4s\n" + "fmin v29.4s, v29.4s, v10.4s\n" + "fmin v28.4s, v28.4s, v10.4s\n" + "fmin v27.4s, v27.4s, v10.4s\n" + "fmax v29.4s, v29.4s, v11.4s\n" + "fmax v28.4s, v28.4s, v11.4s\n" + "fmax v27.4s, v27.4s, v11.4s\n" + "fmin v26.4s, v26.4s, v10.4s\n" + "fmin v25.4s, v25.4s, v10.4s\n" + "fmin v24.4s, v24.4s, v10.4s\n" + "fmax v26.4s, v26.4s, v11.4s\n" + "fmax v25.4s, v25.4s, v11.4s\n" + "fmax v24.4s, v24.4s, v11.4s\n" + "fmin v23.4s, v23.4s, v10.4s\n" + "fmin v22.4s, v22.4s, v10.4s\n" + "fmin v21.4s, v21.4s, v10.4s\n" + "fmax v23.4s, v23.4s, v11.4s\n" + "fmax v22.4s, v22.4s, v11.4s\n" + "fmax v21.4s, v21.4s, v11.4s\n" + "tbz %x[n_output_channels], #1, 17f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #2\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #2\n" + "st1 { v9.d }[0], [x19]\n" + "add x21, x21, x10, LSL #2\n" + "st1 { v7.d }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #2\n" + "st1 { v6.d }[0], [x21]\n" + "add x23, x23, x10, LSL #2\n" + "st1 { v4.d }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #2\n" + "st1 { v3.d }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #2\n" + "st1 { v1.d }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #2\n" + "st1 { v31.d }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #2\n" + "st1 { v29.d }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #2\n" + "st1 { v28.d }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #2\n" + "st1 { v27.d }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #2\n" + "st1 { v26.d }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #2\n" + "st1 { v25.d }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #2\n" + "st1 { v24.d }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #2\n" + "st1 { v23.d }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #2\n" + "st1 { v22.d }[0], [x25]\n" + "add x10, x10, #0x2\n" + "st1 { v21.d }[0], [x26]\n" + "tbz %x[n_output_channels], #0, 18f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #2\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #2\n" + "st1 { v9.s }[2], [x19]\n" + "add x21, x21, x10, LSL #2\n" + "st1 { v7.s }[2], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #2\n" + "st1 { v6.s }[2], [x21]\n" + "add x23, x23, x10, LSL #2\n" + "st1 { v4.s }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #2\n" + "st1 { v3.s }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + 
"add x25, x25, x10, LSL #2\n" + "st1 { v1.s }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #2\n" + "st1 { v31.s }[2], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #2\n" + "st1 { v29.s }[2], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #2\n" + "st1 { v28.s }[2], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #2\n" + "st1 { v27.s }[2], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #2\n" + "st1 { v26.s }[2], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #2\n" + "st1 { v25.s }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #2\n" + "st1 { v24.s }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #2\n" + "st1 { v23.s }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #2\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v21.s }[2], [x26]\n" + "b 18f\n" + "17:" // Output channel oddments: Done: Store: Bit 1: Unset + "tbz %x[n_output_channels], #0, 18f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x10, LSL #2\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x10, LSL #2\n" + "st1 { v9.s }[0], [x19]\n" + "add x21, x21, x10, LSL #2\n" + "st1 { v7.s }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x10, LSL #2\n" + "st1 { v6.s }[0], [x21]\n" + "add x23, x23, x10, LSL #2\n" + "st1 { v4.s }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x10, LSL #2\n" + "st1 { v3.s }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x10, LSL #2\n" + "st1 { v1.s }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x10, LSL #2\n" + "st1 { v31.s }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x10, LSL #2\n" + "st1 { v29.s }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x10, LSL #2\n" + "st1 { v28.s }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x10, LSL #2\n" + "st1 { v27.s }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x10, LSL #2\n" + "st1 { v26.s }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x10, LSL #2\n" + "st1 { v25.s }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x10, LSL #2\n" + "st1 { v24.s }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x10, LSL #2\n" + "st1 { v23.s }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x10, LSL #2\n" + "st1 { v22.s }[0], [x25]\n" + "st1 { v21.s }[0], [x26]\n" + "18:" // Output channel oddments: Done: Store: Bit 1: End + + "19:" // Done + + : [weights] "+&r" (weights) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp new file mode 100644 index 
0000000000..c76cb9906f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + +struct a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size; + + kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + + a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__)
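Between the header above and its implementation below, a short sketch of how this strategy descriptor is consumed may help: the depthfirst driver sizes a parameter buffer with get_packed_size, fills it with pack_parameters (bias, dot-product-interleaved weights and per-channel requantisation data), then calls kernel once per 2x2 output tile. The helper name run_s8q_3x3_s1_tile, the DepthwiseArgs header path and the zero leading-dimension defaults are illustrative assumptions, not code from this patch:

#include <cstdint>
#include <vector>

#include "src/core/NEON/kernels/assembly/depthwise.hpp"  // DepthwiseArgs (path assumed)
#include "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"

namespace arm_conv {
namespace depthwise {

// Illustrative helper: pack the parameters once, then run one output tile.
void run_s8q_3x3_s1_tile(const DepthwiseArgs &args,
                         const arm_gemm::Requantize32 &qp,
                         const int32_t *bias, const int8_t *weights,
                         const int8_t *const *inptrs, // input_rows * input_cols = 16 pointers
                         int8_t *const *outptrs)      // output_rows * output_cols = 4 pointers
{
    using Strat = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst;

    // Bias + interleaved weights + per-channel requantisation parameters.
    std::vector<uint8_t> packed(Strat::get_packed_size(args));
    Strat::pack_parameters(args.input_channels, packed.data(), bias, weights, qp,
                           0 /* ld_weight_col: assumed default */,
                           0 /* ld_weight_row: assumed default */);

    // One call computes a 2x2 spatial tile across all channels.
    Strat strat(args.cpu_info);
    strat.kernel(inptrs, outptrs, packed.data(), args.input_channels, qp);
}

} // namespace depthwise
} // namespace arm_conv

diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..ed8cd4861e --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -0,0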
+1,1318 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(__aarch64__) + +#include "arm_gemm.hpp" +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +{ + __asm__ __volatile__( + "ldp x13, x12, [%x[inptrs], #0x0]\n" + "add SP, SP, #-0x80\n" + "ldp x11, x10, [%x[inptrs], #0x10]\n" + "mov x19, #0x1\n" + "ldp x9, x28, [%x[inptrs], #0x20]\n" + "orr x19, x19, #0x100\n" + "ldp x27, x26, [%x[inptrs], #0x30]\n" + "orr x19, x19, #0x10000\n" + "dup v11.4s, w19\n" + "ldp x25, x24, [%x[outptrs], #0x0]\n" + "mov x23, #0x0\n" + "ldp x22, x21, [%x[outptrs], #0x10]\n" + "lsr x20, %x[n_channels], #0x4\n" + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v9.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n" + "ld1r { v12.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "ld1r { v14.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v13.4s }, [x19]\n" + "cbz x20, 2f\n" + "1:" // Loop + "movi v15.4s, #0x0\n" + "ldr q27, [x13, x23]\n" + "subs x20, x20, #0x1\n" + "movi v10.4s, #0x0\n" + "ldr q1, [x12, x23]\n" + "ldp x13, x12, [%x[inptrs], #0x40]\n" + "ldr q25, [x11, x23]\n" + "zip1 v7.16b, v27.16b, v25.16b\n" + "ldr q23, [x10, x23]\n" + "zip2 v5.16b, v27.16b, v25.16b\n" + "ldp x11, x10, [%x[inptrs], #0x50]\n" + "ldr q31, [x9, x23]\n" + "zip1 v8.16b, v1.16b, v23.16b\n" + "ldr q28, [x28, x23]\n" + "zip2 v3.16b, v1.16b, v23.16b\n" + "ldp x9, x28, [%x[inptrs], #0x60]\n" + "zip1 v6.16b, v7.16b, v8.16b\n" + "ldr q21, [x27, x23]\n" + "zip2 v8.16b, v7.16b, v8.16b\n" + "ldr q26, [x26, x23]\n" + "zip1 v7.16b, v5.16b, v3.16b\n" + "ldp x27, x26, [%x[inptrs], #0x70]\n" + "zip2 v5.16b, v5.16b, v3.16b\n" + "ldr q24, [x13, x23]\n" + "ldr q22, [x12, x23]\n" + "zip1 v2.16b, v31.16b, v21.16b\n" + "zip2 v4.16b, v31.16b, v21.16b\n" + "ldp x13, x12, [%x[inptrs], #0x0]\n" + "zip1 v1.16b, v28.16b, v26.16b\n" + "ldr q20, [x11, x23]\n" + "zip2 v31.16b, v28.16b, v26.16b\n" + "ldr q16, [x10, x23]\n" + "zip1 v3.16b, v2.16b, v1.16b\n" + "ldp x11, x10, [%x[inptrs], #0x10]\n" + "zip2 v2.16b, v2.16b, v1.16b\n" + "ldr q19, [x9, x23]\n" + "zip1 v1.16b, v4.16b, v31.16b\n" + "ldr q0, [x28, x23]\n" + "zip1 v28.16b, v24.16b, v20.16b\n"
+ "ldp x9, x28, [%x[inptrs], #0x20]\n" + "zip2 v26.16b, v24.16b, v20.16b\n" + "ldr q18, [x27, x23]\n" + "zip1 v24.16b, v22.16b, v16.16b\n" + "ldr q17, [x26, x23]\n" + "zip2 v22.16b, v22.16b, v16.16b\n" + "ldp x27, x26, [%x[inptrs], #0x30]\n" + "zip2 v16.16b, v4.16b, v31.16b\n" + "str q7, [SP, #0x0]\n" + "zip1 v31.16b, v28.16b, v24.16b\n" + "str q5, [SP, #0x10]\n" + "zip1 v20.16b, v19.16b, v18.16b\n" + "str q1, [SP, #0x20]\n" + "zip2 v19.16b, v19.16b, v18.16b\n" + "str q16, [SP, #0x30]\n" + "zip1 v18.16b, v0.16b, v17.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "zip2 v17.16b, v0.16b, v17.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "zip2 v28.16b, v28.16b, v24.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "zip1 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x40]\n" + "zip2 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x50]\n" + "zip1 v26.16b, v20.16b, v18.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "zip2 v24.16b, v20.16b, v18.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x60]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x70]\n" + "mov v22.16b, v30.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "mov v20.16b, v30.16b\n" + "mov v19.16b, v30.16b\n" + ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n" + ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n" + ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + "ldr q29, [%x[params], #0x70]\n" + ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + "ldr q3, [SP, #0x20]\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + "ldr q27, [%x[params], #0x80]\n" + ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + "ldr q31, [SP, #0x40]\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "ldr q25, [%x[params], #0x90]\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n" + "ldr q6, [SP, #0x0]\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + "ldr q26, [SP, #0x60]\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "movi v15.4s, #0x0\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0xa0]\n" + ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n" 
+ "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "mov v17.16b, v15.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "ldr q30, [%x[params], #0x60]\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0xb0]\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + "add v19.4s, v19.4s, v13.4s\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x22, x23]\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "mov v19.16b, v30.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + "movi v10.4s, #0x0\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + "mls v30.4s, v15.4s, v14.4s\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + "ldr q29, [%x[params], #0xd0]\n" + ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + "ldr q2, [SP, #0x30]\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + "ldr q27, [%x[params], #0xe0]\n" + ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + "ldr q28, [SP, #0x50]\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "ldr q25, [%x[params], #0xf0]\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n" + "ldr q8, [SP, #0x10]\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + "ldr q24, [SP, #0x70]\n" + "and v18.16b, v30.16b, v21.16b\n" + "mls v19.4s, v17.4s, v14.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "movi v15.4s, #0x0\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n" + "movi v10.4s, #0x0\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0x100]\n" + ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, 
v16.4s, #0x1f\n" + "add v30.4s, v30.4s, v13.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "mov v17.16b, v15.16b\n" + "smax v30.4s, v30.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0x110]\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "ldr q30, [%x[params], #0xc0]\n" + "add v19.4s, v19.4s, v13.4s\n" + "str s20, [x22, x23]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "smax v19.4s, v19.4s, v9.4s\n" + ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "mov v19.16b, v30.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + "mls v30.4s, v15.4s, v14.4s\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n" + ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + "ldr q29, [%x[params], #0x130]\n" + ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + "ldr q27, [%x[params], #0x140]\n" + ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "ldr q25, [%x[params], #0x150]\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v16.16b, v20.16b, v21.16b\n" + "movi v15.4s, #0x0\n" + "mls v19.4s, v17.4s, v14.4s\n" + ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "movi v10.4s, #0x0\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0x160]\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v30.4s, v30.4s, v13.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "mov v17.16b, v15.16b\n" + "smax v30.4s, v30.4s, v9.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "smax 
v20.4s, v20.4s, v9.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v22.4s, v22.4s, v9.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0x170]\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "ldr q30, [%x[params], #0x120]\n" + "add %x[params], %x[params], #0x180\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x22, x23]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "smin v19.4s, v19.4s, v12.4s\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "mov v19.16b, v30.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + "mls v30.4s, v15.4s, v14.4s\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v16.16b, v20.16b, v21.16b\n" + "mls v19.4s, v17.4s, v14.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v22.4s, v22.4s, v9.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "smax 
v19.4s, v19.4s, v9.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x22, x23]\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "add x23, x23, #0x4\n" + "bgt 1b\n" + "tst %x[n_channels], #0xf\n" + "beq 34f\n" + "2:" // Oddments + "and x19, %x[n_channels], #0xf\n" + "add x13, x13, x23\n" + "add x12, x12, x23\n" + "add x11, x11, x23\n" + "add x10, x10, x23\n" + "add x9, x9, x23\n" + "add x28, x28, x23\n" + "add x27, x27, x23\n" + "add x26, x26, x23\n" + "tbz %x[n_channels], #3, 6f\n" + "ld1 { v27.d }[0], [x13], #0x8\n" + "ld1 { v1.d }[0], [x12], #0x8\n" + "ld1 { v25.d }[0], [x11], #0x8\n" + "ld1 { v23.d }[0], [x10], #0x8\n" + "ld1 { v31.d }[0], [x9], #0x8\n" + "ld1 { v28.d }[0], [x28], #0x8\n" + "ld1 { v21.d }[0], [x27], #0x8\n" + "ld1 { v26.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #2, 4f\n" + "ld1 { v27.s }[2], [x13], #0x4\n" + "ld1 { v1.s }[2], [x12], #0x4\n" + "ld1 { v25.s }[2], [x11], #0x4\n" + "ld1 { v23.s }[2], [x10], #0x4\n" + "ld1 { v31.s }[2], [x9], #0x4\n" + "ld1 { v28.s }[2], [x28], #0x4\n" + "ld1 { v21.s }[2], [x27], #0x4\n" + "ld1 { v26.s }[2], [x26], #0x4\n" + "tbz %x[n_channels], #1, 3f\n" + "ld1 { v27.h }[6], [x13], #0x2\n" + "ld1 { v1.h }[6], [x12], #0x2\n" + "ld1 { v25.h }[6], [x11], #0x2\n" + "ld1 { v23.h }[6], [x10], #0x2\n" + "ld1 { v31.h }[6], [x9], #0x2\n" + "ld1 { v28.h }[6], [x28], #0x2\n" + "ld1 { v21.h }[6], [x27], #0x2\n" + "ld1 { v26.h }[6], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[14], [x13], #0x1\n" + "ld1 { v1.b }[14], [x12], #0x1\n" + "ld1 { v25.b }[14], [x11], #0x1\n" + "ld1 { v23.b }[14], [x10], #0x1\n" + "ld1 { v31.b }[14], [x9], #0x1\n" + "ld1 { v28.b }[14], [x28], #0x1\n" + "ld1 { v21.b }[14], [x27], #0x1\n" + "ld1 { v26.b }[14], [x26], #0x1\n" + "b 10f\n" + "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[12], [x13], #0x1\n" + "ld1 { v1.b }[12], [x12], #0x1\n" + "ld1 { v25.b }[12], [x11], #0x1\n" + "ld1 { v23.b }[12], [x10], #0x1\n" + "ld1 { v31.b }[12], [x9], #0x1\n" + "ld1 { v28.b }[12], [x28], #0x1\n" + "ld1 { v21.b }[12], [x27], #0x1\n" + "ld1 { v26.b }[12], [x26], #0x1\n" + "b 10f\n" + "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset + "tbz %x[n_channels], #1, 5f\n" + "ld1 { v27.h }[4], [x13], #0x2\n" + "ld1 { v1.h }[4], [x12], #0x2\n" + "ld1 { v25.h }[4], [x11], #0x2\n" + "ld1 { v23.h }[4], [x10], #0x2\n" + "ld1 { v31.h }[4], [x9], #0x2\n" + "ld1 { v28.h }[4], [x28], #0x2\n" + "ld1 { v21.h }[4], [x27], #0x2\n" + "ld1 { v26.h }[4], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[10], [x13], #0x1\n" + "ld1 { v1.b }[10], [x12], #0x1\n" + "ld1 { v25.b }[10], [x11], #0x1\n" + "ld1 { v23.b }[10], [x10], #0x1\n" + "ld1 { v31.b }[10], [x9], #0x1\n" + "ld1 { v28.b }[10], [x28], #0x1\n" + "ld1 { v21.b }[10], [x27], #0x1\n" + "ld1 { v26.b }[10], [x26], #0x1\n" + "b 10f\n" + "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[8], [x13], #0x1\n" + "ld1 { v1.b }[8], [x12], #0x1\n" + "ld1 { v25.b }[8], [x11], #0x1\n" + "ld1 { v23.b }[8], [x10], #0x1\n" + "ld1 { v31.b }[8], [x9], #0x1\n" + "ld1 { v28.b }[8], [x28], #0x1\n" + "ld1 { v21.b }[8], [x27], #0x1\n" + "ld1 { v26.b }[8], [x26], #0x1\n" + "b 10f\n" + "6:" // Oddments: Load (A): Bit 3: Unset + "tbz %x[n_channels], #2, 8f\n" + "ld1 { v27.s }[0], [x13], #0x4\n" + "ld1 { v1.s }[0], [x12], #0x4\n" + "ld1 { v25.s }[0], [x11], #0x4\n" + "ld1 { v23.s }[0], 
[x10], #0x4\n" + "ld1 { v31.s }[0], [x9], #0x4\n" + "ld1 { v28.s }[0], [x28], #0x4\n" + "ld1 { v21.s }[0], [x27], #0x4\n" + "ld1 { v26.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #1, 7f\n" + "ld1 { v27.h }[2], [x13], #0x2\n" + "ld1 { v1.h }[2], [x12], #0x2\n" + "ld1 { v25.h }[2], [x11], #0x2\n" + "ld1 { v23.h }[2], [x10], #0x2\n" + "ld1 { v31.h }[2], [x9], #0x2\n" + "ld1 { v28.h }[2], [x28], #0x2\n" + "ld1 { v21.h }[2], [x27], #0x2\n" + "ld1 { v26.h }[2], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[6], [x13], #0x1\n" + "ld1 { v1.b }[6], [x12], #0x1\n" + "ld1 { v25.b }[6], [x11], #0x1\n" + "ld1 { v23.b }[6], [x10], #0x1\n" + "ld1 { v31.b }[6], [x9], #0x1\n" + "ld1 { v28.b }[6], [x28], #0x1\n" + "ld1 { v21.b }[6], [x27], #0x1\n" + "ld1 { v26.b }[6], [x26], #0x1\n" + "b 10f\n" + "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[4], [x13], #0x1\n" + "ld1 { v1.b }[4], [x12], #0x1\n" + "ld1 { v25.b }[4], [x11], #0x1\n" + "ld1 { v23.b }[4], [x10], #0x1\n" + "ld1 { v31.b }[4], [x9], #0x1\n" + "ld1 { v28.b }[4], [x28], #0x1\n" + "ld1 { v21.b }[4], [x27], #0x1\n" + "ld1 { v26.b }[4], [x26], #0x1\n" + "b 10f\n" + "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset + "tbz %x[n_channels], #1, 9f\n" + "ld1 { v27.h }[0], [x13], #0x2\n" + "ld1 { v1.h }[0], [x12], #0x2\n" + "ld1 { v25.h }[0], [x11], #0x2\n" + "ld1 { v23.h }[0], [x10], #0x2\n" + "ld1 { v31.h }[0], [x9], #0x2\n" + "ld1 { v28.h }[0], [x28], #0x2\n" + "ld1 { v21.h }[0], [x27], #0x2\n" + "ld1 { v26.h }[0], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[2], [x13], #0x1\n" + "ld1 { v1.b }[2], [x12], #0x1\n" + "ld1 { v25.b }[2], [x11], #0x1\n" + "ld1 { v23.b }[2], [x10], #0x1\n" + "ld1 { v31.b }[2], [x9], #0x1\n" + "ld1 { v28.b }[2], [x28], #0x1\n" + "ld1 { v21.b }[2], [x27], #0x1\n" + "ld1 { v26.b }[2], [x26], #0x1\n" + "b 10f\n" + "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[0], [x13], #0x1\n" + "ld1 { v1.b }[0], [x12], #0x1\n" + "ld1 { v25.b }[0], [x11], #0x1\n" + "ld1 { v23.b }[0], [x10], #0x1\n" + "ld1 { v31.b }[0], [x9], #0x1\n" + "ld1 { v28.b }[0], [x28], #0x1\n" + "ld1 { v21.b }[0], [x27], #0x1\n" + "ld1 { v26.b }[0], [x26], #0x1\n" + "10:" // Oddments: Load (A): Bit 3: End + "ldp x13, x12, [%x[inptrs], #0x40]\n" + "add x13, x13, x23\n" + "ldp x11, x10, [%x[inptrs], #0x50]\n" + "ldp x9, x28, [%x[inptrs], #0x60]\n" + "add x12, x12, x23\n" + "ldp x27, x26, [%x[inptrs], #0x70]\n" + "add x11, x11, x23\n" + "add x10, x10, x23\n" + "add x9, x9, x23\n" + "add x28, x28, x23\n" + "add x27, x27, x23\n" + "add x26, x26, x23\n" + "tbz %x[n_channels], #3, 14f\n" + "ld1 { v24.d }[0], [x13], #0x8\n" + "ld1 { v22.d }[0], [x12], #0x8\n" + "ld1 { v20.d }[0], [x11], #0x8\n" + "ld1 { v16.d }[0], [x10], #0x8\n" + "ld1 { v19.d }[0], [x9], #0x8\n" + "ld1 { v0.d }[0], [x28], #0x8\n" + "ld1 { v18.d }[0], [x27], #0x8\n" + "ld1 { v17.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #2, 12f\n" + "ld1 { v24.s }[2], [x13], #0x4\n" + "ld1 { v22.s }[2], [x12], #0x4\n" + "ld1 { v20.s }[2], [x11], #0x4\n" + "ld1 { v16.s }[2], [x10], #0x4\n" + "ld1 { v19.s }[2], [x9], #0x4\n" + "ld1 { v0.s }[2], [x28], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v17.s }[2], [x26], #0x4\n" + "tbz %x[n_channels], #1, 11f\n" + "ld1 { v24.h }[6], [x13], #0x2\n" + "ld1 { v22.h }[6], [x12], #0x2\n" + "ld1 { v20.h }[6], [x11], #0x2\n" + "ld1 { v16.h }[6], [x10], #0x2\n" + "ld1 { v19.h }[6], [x9], #0x2\n" + "ld1 
{ v0.h }[6], [x28], #0x2\n" + "ld1 { v18.h }[6], [x27], #0x2\n" + "ld1 { v17.h }[6], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[14], [x13], #0x1\n" + "ld1 { v22.b }[14], [x12], #0x1\n" + "ld1 { v20.b }[14], [x11], #0x1\n" + "ld1 { v16.b }[14], [x10], #0x1\n" + "ld1 { v19.b }[14], [x9], #0x1\n" + "ld1 { v0.b }[14], [x28], #0x1\n" + "ld1 { v18.b }[14], [x27], #0x1\n" + "ld1 { v17.b }[14], [x26], #0x1\n" + "b 18f\n" + "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[12], [x13], #0x1\n" + "ld1 { v22.b }[12], [x12], #0x1\n" + "ld1 { v20.b }[12], [x11], #0x1\n" + "ld1 { v16.b }[12], [x10], #0x1\n" + "ld1 { v19.b }[12], [x9], #0x1\n" + "ld1 { v0.b }[12], [x28], #0x1\n" + "ld1 { v18.b }[12], [x27], #0x1\n" + "ld1 { v17.b }[12], [x26], #0x1\n" + "b 18f\n" + "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset + "tbz %x[n_channels], #1, 13f\n" + "ld1 { v24.h }[4], [x13], #0x2\n" + "ld1 { v22.h }[4], [x12], #0x2\n" + "ld1 { v20.h }[4], [x11], #0x2\n" + "ld1 { v16.h }[4], [x10], #0x2\n" + "ld1 { v19.h }[4], [x9], #0x2\n" + "ld1 { v0.h }[4], [x28], #0x2\n" + "ld1 { v18.h }[4], [x27], #0x2\n" + "ld1 { v17.h }[4], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[10], [x13], #0x1\n" + "ld1 { v22.b }[10], [x12], #0x1\n" + "ld1 { v20.b }[10], [x11], #0x1\n" + "ld1 { v16.b }[10], [x10], #0x1\n" + "ld1 { v19.b }[10], [x9], #0x1\n" + "ld1 { v0.b }[10], [x28], #0x1\n" + "ld1 { v18.b }[10], [x27], #0x1\n" + "ld1 { v17.b }[10], [x26], #0x1\n" + "b 18f\n" + "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[8], [x13], #0x1\n" + "ld1 { v22.b }[8], [x12], #0x1\n" + "ld1 { v20.b }[8], [x11], #0x1\n" + "ld1 { v16.b }[8], [x10], #0x1\n" + "ld1 { v19.b }[8], [x9], #0x1\n" + "ld1 { v0.b }[8], [x28], #0x1\n" + "ld1 { v18.b }[8], [x27], #0x1\n" + "ld1 { v17.b }[8], [x26], #0x1\n" + "b 18f\n" + "14:" // Oddments: Load (B): Bit 3: Unset + "tbz %x[n_channels], #2, 16f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "ld1 { v22.s }[0], [x12], #0x4\n" + "ld1 { v20.s }[0], [x11], #0x4\n" + "ld1 { v16.s }[0], [x10], #0x4\n" + "ld1 { v19.s }[0], [x9], #0x4\n" + "ld1 { v0.s }[0], [x28], #0x4\n" + "ld1 { v18.s }[0], [x27], #0x4\n" + "ld1 { v17.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #1, 15f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "ld1 { v22.h }[2], [x12], #0x2\n" + "ld1 { v20.h }[2], [x11], #0x2\n" + "ld1 { v16.h }[2], [x10], #0x2\n" + "ld1 { v19.h }[2], [x9], #0x2\n" + "ld1 { v0.h }[2], [x28], #0x2\n" + "ld1 { v18.h }[2], [x27], #0x2\n" + "ld1 { v17.h }[2], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[6], [x13], #0x1\n" + "ld1 { v22.b }[6], [x12], #0x1\n" + "ld1 { v20.b }[6], [x11], #0x1\n" + "ld1 { v16.b }[6], [x10], #0x1\n" + "ld1 { v19.b }[6], [x9], #0x1\n" + "ld1 { v0.b }[6], [x28], #0x1\n" + "ld1 { v18.b }[6], [x27], #0x1\n" + "ld1 { v17.b }[6], [x26], #0x1\n" + "b 18f\n" + "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[4], [x13], #0x1\n" + "ld1 { v22.b }[4], [x12], #0x1\n" + "ld1 { v20.b }[4], [x11], #0x1\n" + "ld1 { v16.b }[4], [x10], #0x1\n" + "ld1 { v19.b }[4], [x9], #0x1\n" + "ld1 { v0.b }[4], [x28], #0x1\n" + "ld1 { v18.b }[4], [x27], #0x1\n" + "ld1 { v17.b }[4], [x26], #0x1\n" + "b 18f\n" + "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset + "tbz %x[n_channels], #1, 17f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "ld1 { v22.h }[0], [x12], #0x2\n" + "ld1 { v20.h }[0], [x11], 
#0x2\n" + "ld1 { v16.h }[0], [x10], #0x2\n" + "ld1 { v19.h }[0], [x9], #0x2\n" + "ld1 { v0.h }[0], [x28], #0x2\n" + "ld1 { v18.h }[0], [x27], #0x2\n" + "ld1 { v17.h }[0], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[2], [x13], #0x1\n" + "ld1 { v22.b }[2], [x12], #0x1\n" + "ld1 { v20.b }[2], [x11], #0x1\n" + "ld1 { v16.b }[2], [x10], #0x1\n" + "ld1 { v19.b }[2], [x9], #0x1\n" + "ld1 { v0.b }[2], [x28], #0x1\n" + "ld1 { v18.b }[2], [x27], #0x1\n" + "ld1 { v17.b }[2], [x26], #0x1\n" + "b 18f\n" + "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[0], [x13], #0x1\n" + "ld1 { v22.b }[0], [x12], #0x1\n" + "ld1 { v20.b }[0], [x11], #0x1\n" + "ld1 { v16.b }[0], [x10], #0x1\n" + "ld1 { v19.b }[0], [x9], #0x1\n" + "ld1 { v0.b }[0], [x28], #0x1\n" + "ld1 { v18.b }[0], [x27], #0x1\n" + "ld1 { v17.b }[0], [x26], #0x1\n" + "18:" // Oddments: Load (B): Bit 3: End + "zip1 v7.16b, v27.16b, v25.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "cmp x19, #0x4\n" + "zip2 v5.16b, v27.16b, v25.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "zip1 v8.16b, v1.16b, v23.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "zip2 v3.16b, v1.16b, v23.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "zip1 v2.16b, v31.16b, v21.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "zip2 v4.16b, v31.16b, v21.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "zip1 v1.16b, v28.16b, v26.16b\n" + "zip2 v31.16b, v28.16b, v26.16b\n" + "zip1 v28.16b, v24.16b, v20.16b\n" + "zip2 v26.16b, v24.16b, v20.16b\n" + "zip1 v24.16b, v22.16b, v16.16b\n" + "zip2 v22.16b, v22.16b, v16.16b\n" + "zip1 v20.16b, v19.16b, v18.16b\n" + "zip2 v19.16b, v19.16b, v18.16b\n" + "zip1 v18.16b, v0.16b, v17.16b\n" + "zip2 v17.16b, v0.16b, v17.16b\n" + "zip1 v6.16b, v7.16b, v8.16b\n" + "zip2 v8.16b, v7.16b, v8.16b\n" + "zip1 v7.16b, v5.16b, v3.16b\n" + "str q7, [SP, #0x0]\n" + "zip2 v5.16b, v5.16b, v3.16b\n" + "str q5, [SP, #0x10]\n" + "zip1 v3.16b, v2.16b, v1.16b\n" + "zip2 v2.16b, v2.16b, v1.16b\n" + "zip1 v1.16b, v4.16b, v31.16b\n" + "str q1, [SP, #0x20]\n" + "zip2 v16.16b, v4.16b, v31.16b\n" + "str q16, [SP, #0x30]\n" + "zip1 v31.16b, v28.16b, v24.16b\n" + "zip2 v28.16b, v28.16b, v24.16b\n" + "zip1 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x40]\n" + "zip2 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x50]\n" + "zip1 v26.16b, v20.16b, v18.16b\n" + "zip2 v24.16b, v20.16b, v18.16b\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x60]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x70]\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "mov v19.16b, v30.16b\n" + ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + "movi v15.4s, #0x0\n" + ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n" + ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + "mov v17.16b, v15.16b\n" + ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n" 
+ ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + "movi v10.4s, #0x0\n" + ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "add v30.4s, v30.4s, v13.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v20.4s, v20.4s, v9.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v22.4s, v22.4s, v13.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 19f\n" + "str s30, [x25, x23]\n" + "str s22, [x24, x23]\n" + "str s20, [x22, x23]\n" + "str s19, [x21, x23]\n" + "b 22f\n" + "19:" // Oddments: Unroll 0: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 20f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 21f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 21f\n" + "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset + "tbz x19, #0, 21f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], #0x1\n" + "st1 { v19.b }[0], [x21], #0x1\n" + "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End + + "22:" // Oddments: Unroll 0: After oddment store + "add x23, x23, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "movi v15.4s, #0x0\n" + "ldr q30, [%x[params], #0x0]\n" + ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "cmp x19, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q27, [%x[params], #0x20]\n" + "ldr q25, [%x[params], #0x30]\n" + "mov v22.16b, v30.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "mov v20.16b, v30.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "mov v19.16b, v30.16b\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + 
".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v30.4s, v30.4s, v12.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 23f\n" + "str s30, [x25, x23]\n" + "str s22, [x24, x23]\n" + "str s20, [x22, x23]\n" + "str s19, [x21, x23]\n" + "b 26f\n" + "23:" // Oddments: Unroll 1: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 24f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 25f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 25f\n" + "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset + "tbz x19, #0, 25f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], 
#0x1\n" + "st1 { v19.b }[0], [x21], #0x1\n" + "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End + + "26:" // Oddments: Unroll 1: After oddment store + "add x23, x23, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "movi v15.4s, #0x0\n" + "ldr q6, [SP, #0x0]\n" + "movi v10.4s, #0x0\n" + "ldr q3, [SP, #0x20]\n" + "cmp x19, #0x4\n" + ".inst 0x4e83956f // sdot v15.4s, v11.16b, v3.16b\n" + "ldr q31, [SP, #0x40]\n" + "ldr q26, [SP, #0x60]\n" + ".inst 0x4e9f956f // sdot v15.4s, v11.16b, v31.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "ldr q29, [%x[params], #0x10]\n" + "mov v22.16b, v30.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "mov v20.16b, v30.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "mov v19.16b, v30.16b\n" + "ldr q23, [%x[params], #0x40]\n" + ".inst 0x4e8697be // sdot v30.4s, v29.16b, v6.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x4e86956f // sdot v15.4s, v11.16b, v6.16b\n" + ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8697b6 // sdot v22.4s, v29.16b, v6.16b\n" + ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + ".inst 0x4e83956a // sdot v10.4s, v11.16b, v3.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + ".inst 0x4e9f956a // sdot v10.4s, v11.16b, v31.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e86956a // sdot v10.4s, v11.16b, v6.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e9a9571 // sdot v17.4s, v11.16b, v26.16b\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v30.4s, v30.4s, v12.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 
v19.16b, v19.16b, v19.16b\n" + "blt 27f\n" + "str s30, [x25, x23]\n" + "str s22, [x24, x23]\n" + "str s20, [x22, x23]\n" + "str s19, [x21, x23]\n" + "b 30f\n" + "27:" // Oddments: Unroll 2: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 28f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 29f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 29f\n" + "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset + "tbz x19, #0, 29f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], #0x1\n" + "st1 { v19.b }[0], [x21], #0x1\n" + "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End + + "30:" // Oddments: Unroll 2: After oddment store + "add x23, x23, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "movi v15.4s, #0x0\n" + "ldr q8, [SP, #0x10]\n" + "movi v10.4s, #0x0\n" + "ldr q2, [SP, #0x30]\n" + "ldr q28, [SP, #0x50]\n" + ".inst 0x4e82956f // sdot v15.4s, v11.16b, v2.16b\n" + "ldr q24, [SP, #0x70]\n" + "ldr q30, [%x[params], #0x0]\n" + "mov v22.16b, v30.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "mov v20.16b, v30.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "mov v19.16b, v30.16b\n" + "ldr q25, [%x[params], #0x30]\n" + ".inst 0x4e9c956f // sdot v15.4s, v11.16b, v28.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "ldr q21, [%x[params], #0x50]\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + "add %x[params], %x[params], #0x60\n" + ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x4e88956f // sdot v15.4s, v11.16b, v8.16b\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + ".inst 0x4e82956a // sdot v10.4s, v11.16b, v2.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + ".inst 0x4e9c956a // sdot v10.4s, v11.16b, v28.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x4e88956a // sdot v10.4s, v11.16b, v8.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x4e989571 // sdot v17.4s, v11.16b, v24.16b\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v30.4s, v30.4s, v12.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + 
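+ // This final unroll has no "blt" fast path: at most three channels can remain by now, so the requantized results always exit through the oddment store labels (31-33) below, writing 2 bytes and/or 1 byte according to bits 1 and 0 of the remaining count in x19.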
"uzp1 v30.16b, v30.16b, v30.16b\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "31:" // Oddments: Unroll 3: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 32f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 33f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 33f\n" + "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset + "tbz x19, #0, 33f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], #0x1\n" + "st1 { v19.b }[0], [x21], #0x1\n" + "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End + + "34:" // End + "add SP, SP, #0x80\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..76c927abcb --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + +struct a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + + kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + + a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..3001276fb5 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1192 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + int8_t *const *const outptrs; + const int8_t *inptrs[16]; + + Params( + long unsigned int n_channels, + const int8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[5]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[3]; + inptrs[3] = inptrs_raw[6]; + inptrs[4] = inptrs_raw[9]; + inptrs[5] = inptrs_raw[12]; + inptrs[6] = inptrs_raw[15]; + inptrs[7] = inptrs_raw[1]; + inptrs[8] = inptrs_raw[2]; + inptrs[9] = inptrs_raw[10]; + inptrs[10] = inptrs_raw[4]; + inptrs[11] = inptrs_raw[7]; + inptrs[12] = inptrs_raw[8]; + inptrs[13] = inptrs_raw[11]; + inptrs[14] = inptrs_raw[13]; + inptrs[15] = inptrs_raw[14]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x17, #0x0\n" + "ldr x16, [%x[params], %[offsetof_Params_weights]]\n" + "mov x15, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x14, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x12, x8, #0x3\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v14.16b }, [x19]\n" + "add x19, x22, 
%[offsetof_Requantize32_c_offset]\n" + "ld1r { v9.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v15.4s }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v24.4s }, [x20]\n" + "ld1r { v12.4s }, [x19]\n" + "ldp x10, x9, [x21, #0x0]\n" + "ldp x28, x27, [x21, #0x10]\n" + "cbz x12, 3f\n" + "subs x12, x12, #0x1\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q13, [x19, #0x0]\n" + "mov v17.16b, v13.16b\n" + "ldr q19, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v13.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v23.16b, v13.16b\n" + "ldr d0, [x16, #0x0]\n" + "ssubl v0.8h, v0.8b, v9.8b\n" + "mov v25.16b, v19.16b\n" + "ldr d1, [x16, #0x8]\n" + "mov v21.16b, v19.16b\n" + "ldr d2, [x16, #0x10]\n" + "ssubl v1.8h, v1.8b, v9.8b\n" + "mov v20.16b, v19.16b\n" + "ldr d3, [x16, #0x18]\n" + "ldr d4, [x16, #0x20]\n" + "ssubl v2.8h, v2.8b, v9.8b\n" + "ldr d5, [x16, #0x28]\n" + "ssubl v3.8h, v3.8b, v9.8b\n" + "ldr d6, [x16, #0x30]\n" + "ldr d7, [x16, #0x38]\n" + "ssubl v4.8h, v4.8b, v9.8b\n" + "ldr d8, [x16, #0x40]\n" + "ssubl v5.8h, v5.8b, v9.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "ssubl v6.8h, v6.8b, v9.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "ssubl v7.8h, v7.8b, v9.8b\n" + "ssubl v8.8h, v8.8b, v9.8b\n" + "ldr x19, [x14, #0x20]\n" + "ldr d31, [x23, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "ldr d30, [x22, x17]\n" + "ldr d29, [x21, x17]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "ldr d28, [x20, x17]\n" + "ldr d27, [x19, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "ssubl v27.8h, v27.8b, v14.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v13.4s, v31.4h, v4.4h\n" + "ldr x21, [x14, #0x28]\n" + "add x16, x16, #0x48\n" + "smlal2 v19.4s, v31.8h, v4.8h\n" + "ldr x20, [x14, #0x30]\n" + "subs x12, x12, #0x1\n" + "smlal v17.4s, v31.4h, v3.4h\n" + "ldr x26, [x14, #0x38]\n" + "smlal2 v25.4s, v31.8h, v3.8h\n" + "ldr x25, [x14, #0x40]\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "ldr x19, [x14, #0x48]\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "ldr x24, [x14, #0x50]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "ldr x23, [x14, #0x58]\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "ldr x22, [x14, #0x60]\n" + "smlal2 v19.4s, v30.8h, v0.8h\n" + "ldr d30, [x19, x17]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "ldr x21, [x14, #0x68]\n" + "smlal2 v25.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "ldr x20, [x14, #0x70]\n" + "smlal2 v19.4s, v28.8h, v5.8h\n" + "ldr x19, [x14, #0x78]\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "ldr q26, [x13, #0x0]\n" + "smlal2 v25.4s, v28.8h, v4.8h\n" + "ldr q10, [x11, #0x0]\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "ldr q11, [x13, #0x10]\n" + "add x13, x13, #0x20\n" + "smlal2 v21.4s, v28.8h, v2.8h\n" + "ldr q18, [x11, #0x10]\n" + "add x11, x11, #0x20\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x17]\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v21.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" + "smlal2 v19.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v27.4h, v6.4h\n" + "smlal2 v25.4s, v27.8h, v6.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "smlal v23.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal v13.4s, 
v28.4h, v1.4h\n" + "smlal2 v19.4s, v28.8h, v1.8h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v25.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x17]\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" + "smlal2 v19.4s, v31.8h, v2.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v25.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v8.4h\n" + "smlal2 v19.4s, v30.8h, v8.8h\n" + "smlal v17.4s, v30.4h, v7.4h\n" + "smlal2 v25.4s, v30.8h, v7.8h\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal2 v21.4s, v30.8h, v5.8h\n" + "smlal v23.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x17]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal v13.4s, v29.4h, v3.4h\n" + "smlal2 v19.4s, v29.8h, v3.8h\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v5.4h\n" + "smlal2 v25.4s, v28.8h, v5.8h\n" + "smlal v23.4s, v28.4h, v2.4h\n" + "smlal2 v20.4s, v28.8h, v2.8h\n" + "ldr d28, [x19, x17]\n" + "add x17, x17, #0x8\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v19.4s, v31.8h, v6.8h\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal v17.4s, v30.4h, v8.4h\n" + "smlal2 v25.4s, v30.8h, v8.8h\n" + "smlal v23.4s, v30.4h, v5.4h\n" + "smlal2 v20.4s, v30.8h, v5.8h\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal2 v21.4s, v29.8h, v7.8h\n" + "smlal v23.4s, v29.4h, v6.4h\n" + "smlal2 v20.4s, v29.8h, v6.8h\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal2 v21.4s, v28.8h, v8.8h\n" + "smlal v23.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "sqrdmulh v13.4s, v13.4s, v26.4s\n" + "sqrdmulh v19.4s, v19.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v26.4s\n" + "sqrdmulh v25.4s, v25.4s, v11.4s\n" + "and v22.16b, v13.16b, v10.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v28.16b, v19.16b, v18.16b\n" + "and v3.16b, v17.16b, v10.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v6.16b, v25.16b, v18.16b\n" + "sqrdmulh v16.4s, v16.4s, v26.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v11.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqrdmulh v23.4s, v23.4s, v26.4s\n" + "and v0.16b, v16.16b, v10.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v29.16b, v21.16b, v18.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "add v13.4s, v13.4s, v15.4s\n" + "srshl v19.4s, v19.4s, v18.4s\n" + "srshl v17.4s, v17.4s, v10.4s\n" + "srshl v25.4s, v25.4s, v18.4s\n" + "smin v13.4s, v13.4s, v12.4s\n" + "add v19.4s, v19.4s, v15.4s\n" + "add v17.4s, v17.4s, v15.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smin v19.4s, v19.4s, v12.4s\n" + "smin v17.4s, v17.4s, v12.4s\n" + "add v25.4s, v25.4s, v15.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smin v25.4s, v25.4s, v12.4s\n" + "uzp1 v13.16b, v13.16b, v19.16b\n" + "sqadd v16.4s, v16.4s, v0.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "str d13, [x10, x15]\n" + "smax v25.4s, v25.4s, v24.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "srshl v16.4s, v16.4s, v10.4s\n" + "and v3.16b, v23.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v25.16b\n" + "add v16.4s, v16.4s, v15.4s\n" + "srshl v21.4s, v21.4s, v18.4s\n" + 
"uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x9, x15]\n" + "smin v16.4s, v16.4s, v12.4s\n" + "sqrdmulh v20.4s, v20.4s, v11.4s\n" + "add v21.4s, v21.4s, v15.4s\n" + "sqadd v23.4s, v23.4s, v3.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smin v21.4s, v21.4s, v12.4s\n" + "and v25.16b, v20.16b, v18.16b\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "smax v21.4s, v21.4s, v24.4s\n" + "srshl v23.4s, v23.4s, v10.4s\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "add v23.4s, v23.4s, v15.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x28, x15]\n" + "smin v23.4s, v23.4s, v12.4s\n" + "sqadd v20.4s, v20.4s, v25.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" + "srshl v20.4s, v20.4s, v18.4s\n" + "add v20.4s, v20.4s, v15.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d23, [x27, x15]\n" + "add x15, x15, #0x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q13, [x19, #0x0]\n" + "mov v17.16b, v13.16b\n" + "ldr q19, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v13.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v23.16b, v13.16b\n" + "ldr d0, [x16, #0x0]\n" + "ssubl v0.8h, v0.8b, v9.8b\n" + "mov v25.16b, v19.16b\n" + "ldr d1, [x16, #0x8]\n" + "mov v21.16b, v19.16b\n" + "ldr d2, [x16, #0x10]\n" + "ssubl v1.8h, v1.8b, v9.8b\n" + "mov v20.16b, v19.16b\n" + "ldr d3, [x16, #0x18]\n" + "ldr d4, [x16, #0x20]\n" + "ssubl v2.8h, v2.8b, v9.8b\n" + "ldr d5, [x16, #0x28]\n" + "ssubl v3.8h, v3.8b, v9.8b\n" + "ldr d6, [x16, #0x30]\n" + "ldr d7, [x16, #0x38]\n" + "ssubl v4.8h, v4.8b, v9.8b\n" + "ldr d8, [x16, #0x40]\n" + "ssubl v5.8h, v5.8b, v9.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "ssubl v6.8h, v6.8b, v9.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "ssubl v7.8h, v7.8b, v9.8b\n" + "ssubl v8.8h, v8.8b, v9.8b\n" + "ldr x19, [x14, #0x20]\n" + "ldr d31, [x23, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "ldr d30, [x22, x17]\n" + "ldr d29, [x21, x17]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "ldr d28, [x20, x17]\n" + "ldr d27, [x19, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "ssubl v27.8h, v27.8b, v14.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v13.4s, v31.4h, v4.4h\n" + "ldr x21, [x14, #0x28]\n" + "tst x8, #0x7\n" + "smlal2 v19.4s, v31.8h, v4.8h\n" + "ldr x20, [x14, #0x30]\n" + "smlal v17.4s, v31.4h, v3.4h\n" + "ldr x26, [x14, #0x38]\n" + "smlal2 v25.4s, v31.8h, v3.8h\n" + "ldr x25, [x14, #0x40]\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "ldr x19, [x14, #0x48]\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "ldr x24, [x14, #0x50]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "ldr x23, [x14, #0x58]\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "ldr x22, [x14, #0x60]\n" + "smlal2 v19.4s, v30.8h, v0.8h\n" + "ldr d30, [x19, x17]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "ldr x21, [x14, #0x68]\n" + "smlal2 v25.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "ldr x20, [x14, #0x70]\n" + "smlal2 v19.4s, v28.8h, v5.8h\n" + "ldr x19, [x14, #0x78]\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "ldr q26, [x13, #0x0]\n" + "smlal2 v25.4s, v28.8h, v4.8h\n" + "ldr q10, [x11, #0x0]\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "ldr q11, [x13, #0x10]\n" + "add x13, x13, #0x20\n" + "smlal2 v21.4s, v28.8h, v2.8h\n" + "ldr q18, [x11, #0x10]\n" + "add x11, x11, #0x20\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, 
v28.8h, v1.8h\n" + "ldr d28, [x26, x17]\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v21.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" + "smlal2 v19.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v27.4h, v6.4h\n" + "smlal2 v25.4s, v27.8h, v6.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "smlal v23.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v19.4s, v28.8h, v1.8h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v25.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x17]\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" + "smlal2 v19.4s, v31.8h, v2.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v25.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x17]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v8.4h\n" + "smlal2 v19.4s, v30.8h, v8.8h\n" + "smlal v17.4s, v30.4h, v7.4h\n" + "smlal2 v25.4s, v30.8h, v7.8h\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal2 v21.4s, v30.8h, v5.8h\n" + "smlal v23.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x17]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal v13.4s, v29.4h, v3.4h\n" + "smlal2 v19.4s, v29.8h, v3.8h\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x17]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v5.4h\n" + "smlal2 v25.4s, v28.8h, v5.8h\n" + "smlal v23.4s, v28.4h, v2.4h\n" + "smlal2 v20.4s, v28.8h, v2.8h\n" + "ldr d28, [x19, x17]\n" + "add x17, x17, #0x8\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v19.4s, v31.8h, v6.8h\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal v17.4s, v30.4h, v8.4h\n" + "smlal2 v25.4s, v30.8h, v8.8h\n" + "smlal v23.4s, v30.4h, v5.4h\n" + "smlal2 v20.4s, v30.8h, v5.8h\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal2 v21.4s, v29.8h, v7.8h\n" + "smlal v23.4s, v29.4h, v6.4h\n" + "smlal2 v20.4s, v29.8h, v6.8h\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal2 v21.4s, v28.8h, v8.8h\n" + "smlal v23.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "sqrdmulh v13.4s, v13.4s, v26.4s\n" + "sqrdmulh v19.4s, v19.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v26.4s\n" + "sqrdmulh v25.4s, v25.4s, v11.4s\n" + "and v22.16b, v13.16b, v10.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v28.16b, v19.16b, v18.16b\n" + "and v3.16b, v17.16b, v10.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v6.16b, v25.16b, v18.16b\n" + "sqrdmulh v16.4s, v16.4s, v26.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v11.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqrdmulh v23.4s, v23.4s, v26.4s\n" + "and v0.16b, v16.16b, v10.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v29.16b, v21.16b, v18.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "add v13.4s, v13.4s, v15.4s\n" + "srshl v19.4s, v19.4s, v18.4s\n" + "srshl v17.4s, v17.4s, v10.4s\n" + "srshl v25.4s, v25.4s, v18.4s\n" + "smin v13.4s, v13.4s, v12.4s\n" + "add v19.4s, v19.4s, v15.4s\n" + "add v17.4s, v17.4s, v15.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smin v19.4s, v19.4s, v12.4s\n" + "smin v17.4s, v17.4s, v12.4s\n" + "add v25.4s, v25.4s, v15.4s\n" + "smax 
v19.4s, v19.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smin v25.4s, v25.4s, v12.4s\n" + "uzp1 v13.16b, v13.16b, v19.16b\n" + "sqadd v16.4s, v16.4s, v0.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "str d13, [x10, x15]\n" + "smax v25.4s, v25.4s, v24.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "srshl v16.4s, v16.4s, v10.4s\n" + "and v3.16b, v23.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v25.16b\n" + "add v16.4s, v16.4s, v15.4s\n" + "srshl v21.4s, v21.4s, v18.4s\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x9, x15]\n" + "smin v16.4s, v16.4s, v12.4s\n" + "sqrdmulh v20.4s, v20.4s, v11.4s\n" + "add v21.4s, v21.4s, v15.4s\n" + "sqadd v23.4s, v23.4s, v3.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smin v21.4s, v21.4s, v12.4s\n" + "and v25.16b, v20.16b, v18.16b\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "smax v21.4s, v21.4s, v24.4s\n" + "srshl v23.4s, v23.4s, v10.4s\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "add v23.4s, v23.4s, v15.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x28, x15]\n" + "smin v23.4s, v23.4s, v12.4s\n" + "sqadd v20.4s, v20.4s, v25.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" + "srshl v20.4s, v20.4s, v18.4s\n" + "add v20.4s, v20.4s, v15.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d23, [x27, x15]\n" + "add x15, x15, #0x8\n" + "beq 64f\n" + "add x16, x16, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x8, #2, 5f\n" + "ld1 { v13.4s }, [x19], #0x10\n" + "tbz x8, #1, 4f\n" + "ld1 { v19.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v19.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v19.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x8, #1, 6f\n" + "ld1 { v13.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v13.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v17.16b, v13.16b\n" + "ldr d0, [x16, #0x0]\n" + "mov v25.16b, v19.16b\n" + "ldr d1, [x16, #0x8]\n" + "mov v16.16b, v13.16b\n" + "ldr d2, [x16, #0x10]\n" + "mov v21.16b, v19.16b\n" + "ldr d3, [x16, #0x18]\n" + "mov v23.16b, v13.16b\n" + "ldr d4, [x16, #0x20]\n" + "ssubl v0.8h, v0.8b, v9.8b\n" + "mov v20.16b, v19.16b\n" + "ldr d5, [x16, #0x28]\n" + "ssubl v1.8h, v1.8b, v9.8b\n" + "ldr d6, [x16, #0x30]\n" + "ssubl v2.8h, v2.8b, v9.8b\n" + "ldr d7, [x16, #0x38]\n" + "ssubl v3.8h, v3.8b, v9.8b\n" + "ldr d8, [x16, #0x40]\n" + "ssubl v4.8h, v4.8b, v9.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "ssubl v5.8h, v5.8b, v9.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "ssubl v6.8h, v6.8b, v9.8b\n" + "ssubl v7.8h, v7.8b, v9.8b\n" + "ldr x19, [x14, #0x20]\n" + "ssubl v8.8h, v8.8b, v9.8b\n" + "add x23, x23, x17\n" + "add x22, x22, x17\n" + "add x21, x21, x17\n" + "add x20, x20, x17\n" + "add x19, x19, x17\n" + "tbz x8, #2, 9f\n" + "ld1 { v31.s }[0], [x23], #0x4\n" + "ld1 { v30.s }[0], [x22], #0x4\n" + "ld1 { v29.s }[0], [x21], #0x4\n" + "ld1 { v28.s }[0], [x20], #0x4\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 8f\n" + "ld1 { v31.h }[2], [x23], #0x2\n" + "ld1 { v30.h }[2], [x22], #0x2\n" + "ld1 { v29.h }[2], [x21], #0x2\n" + "ld1 { v28.h }[2], [x20], #0x2\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[6], [x23]\n" + "ld1 { v30.b }[6], [x22]\n" + "ld1 { v29.b }[6], [x21]\n" + "ld1 { v28.b 
}[6], [x20]\n" + "ld1 { v27.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[4], [x23]\n" + "ld1 { v30.b }[4], [x22]\n" + "ld1 { v29.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x20]\n" + "ld1 { v27.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x8, #1, 10f\n" + "ld1 { v31.h }[0], [x23], #0x2\n" + "ld1 { v30.h }[0], [x22], #0x2\n" + "ld1 { v29.h }[0], [x21], #0x2\n" + "ld1 { v28.h }[0], [x20], #0x2\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[2], [x23]\n" + "ld1 { v30.b }[2], [x22]\n" + "ld1 { v29.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x20]\n" + "ld1 { v27.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[0], [x23]\n" + "ld1 { v30.b }[0], [x22]\n" + "ld1 { v29.b }[0], [x21]\n" + "ld1 { v28.b }[0], [x20]\n" + "ld1 { v27.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ldr x21, [x14, #0x28]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v4.4h\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal2 v19.4s, v31.8h, v4.8h\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v31.4h, v3.4h\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v25.4s, v31.8h, v3.8h\n" + "ssubl v27.8h, v27.8b, v14.8b\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "add x21, x21, x17\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "smlal2 v19.4s, v30.8h, v0.8h\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v25.4s, v29.8h, v2.8h\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "smlal2 v19.4s, v28.8h, v5.8h\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal2 v25.4s, v28.8h, v4.8h\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "smlal2 v21.4s, v28.8h, v2.8h\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "tbz x8, #2, 13f\n" + "ld1 { v31.s }[0], [x21], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v31.h }[2], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[6], [x21]\n" + "b 15f\n" + "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[4], [x21]\n" + "b 15f\n" + "13:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x8, #1, 14f\n" + "ld1 { v31.h }[0], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[2], [x21]\n" + "b 15f\n" + "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[0], [x21]\n" + "15:" // Oddments: Load (3, 0): Bit 2: End + "smlal v13.4s, v27.4h, v7.4h\n" + "ldr x20, [x14, #0x30]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v19.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v27.4h, v6.4h\n" + "add x20, x20, x17\n" + "smlal2 v25.4s, v27.8h, v6.8h\n" + "smlal v23.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v21.4s, v31.8h, v6.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "tbz x8, #2, 17f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 19f\n" + "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 19f\n" + "17:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x8, #1, 18f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 19f\n" + "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v29.b 
}[0], [x20]\n" + "19:" // Oddments: Load (3, 3): Bit 2: End + "ldr x26, [x14, #0x38]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "add x26, x26, x17\n" + "tbz x8, #2, 21f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x8, #1, 20f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[6], [x26]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[4], [x26]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 1): Bit 2: Unset + "tbz x8, #1, 22f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[2], [x26]\n" + "b 23f\n" + "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[0], [x26]\n" + "23:" // Oddments: Load (0, 1): Bit 2: End + "ldr x25, [x14, #0x40]\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v19.4s, v28.8h, v1.8h\n" + "add x25, x25, x17\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v25.4s, v28.8h, v0.8h\n" + "tbz x8, #2, 25f\n" + "ld1 { v31.s }[0], [x25], #0x4\n" + "tbz x8, #1, 24f\n" + "ld1 { v31.h }[2], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[6], [x25]\n" + "b 27f\n" + "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[4], [x25]\n" + "b 27f\n" + "25:" // Oddments: Load (0, 2): Bit 2: Unset + "tbz x8, #1, 26f\n" + "ld1 { v31.h }[0], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[2], [x25]\n" + "b 27f\n" + "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[0], [x25]\n" + "27:" // Oddments: Load (0, 2): Bit 2: End + "ldr x19, [x14, #0x48]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" + "smlal2 v19.4s, v31.8h, v2.8h\n" + "add x19, x19, x17\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v25.4s, v31.8h, v1.8h\n" + "tbz x8, #2, 29f\n" + "ld1 { v30.s }[0], [x19], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v30.h }[2], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[6], [x19]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[4], [x19]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x8, #1, 30f\n" + "ld1 { v30.h }[0], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[2], [x19]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[0], [x19]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "ldr x24, [x14, #0x50]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v8.4h\n" + "smlal2 v19.4s, v30.8h, v8.8h\n" + "add x24, x24, x17\n" + "smlal v17.4s, v30.4h, v7.4h\n" + "smlal2 v25.4s, v30.8h, v7.8h\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal2 v21.4s, v30.8h, v5.8h\n" + "smlal v23.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 35f\n" + "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 35f\n" + "33:" // Oddments: Load (1, 0): Bit 2: Unset + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 35f\n" + "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x24]\n" + "35:" // Oddments: Load (1, 0): Bit 2: End + "ldr x23, [x14, 
#0x58]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v13.4s, v29.4h, v3.4h\n" + "smlal2 v19.4s, v29.8h, v3.8h\n" + "add x23, x23, x17\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "tbz x8, #2, 37f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[6], [x23]\n" + "b 39f\n" + "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[4], [x23]\n" + "b 39f\n" + "37:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x8, #1, 38f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[2], [x23]\n" + "b 39f\n" + "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[0], [x23]\n" + "39:" // Oddments: Load (1, 3): Bit 2: End + "ldr x22, [x14, #0x60]\n" + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v5.4h\n" + "smlal2 v25.4s, v28.8h, v5.8h\n" + "add x22, x22, x17\n" + "smlal v23.4s, v28.4h, v2.4h\n" + "smlal2 v20.4s, v28.8h, v2.8h\n" + "tbz x8, #2, 41f\n" + "ld1 { v31.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + "ld1 { v31.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x8, #1, 42f\n" + "ld1 { v31.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 0): Bit 2: End + "ldr x21, [x14, #0x68]\n" + "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "smlal2 v19.4s, v31.8h, v6.8h\n" + "add x21, x21, x17\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "tbz x8, #2, 45f\n" + "ld1 { v30.s }[0], [x21], #0x4\n" + "tbz x8, #1, 44f\n" + "ld1 { v30.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[6], [x21]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[4], [x21]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x8, #1, 46f\n" + "ld1 { v30.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[2], [x21]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[0], [x21]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "ldr x20, [x14, #0x70]\n" + "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal v17.4s, v30.4h, v8.4h\n" + "smlal2 v25.4s, v30.8h, v8.8h\n" + "add x20, x20, x17\n" + "smlal v23.4s, v30.4h, v5.4h\n" + "smlal2 v20.4s, v30.8h, v5.8h\n" + "tbz x8, #2, 49f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x8, #1, 50f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[0], [x20]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "ldr x19, [x14, #0x78]\n" + "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal2 v21.4s, v29.8h, v7.8h\n" + "add x19, x19, x17\n" + "smlal v23.4s, 
v29.4h, v6.4h\n" + "smlal2 v20.4s, v29.8h, v6.8h\n" + "tbz x8, #2, 53f\n" + "ld1 { v28.s }[0], [x19], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v28.h }[2], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[6], [x19]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[4], [x19]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x8, #1, 54f\n" + "ld1 { v28.h }[0], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[2], [x19]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[0], [x19]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal2 v21.4s, v28.8h, v8.8h\n" + "smlal v23.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "tbz x8, #2, 57f\n" + "ld1 { v26.4s }, [x13], #0x10\n" + "ld1 { v10.4s }, [x11], #0x10\n" + "tbz x8, #1, 56f\n" + "ld1 { v11.d }[0], [x13], #0x8\n" + "ld1 { v18.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v18.s }[2], [x11]\n" + "b 59f\n" + "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v11.s }[0], [x13]\n" + "ld1 { v18.s }[0], [x11]\n" + "b 59f\n" + "57:" // Oddments: Load requant params: Bit 2: Unset + "tbz x8, #1, 58f\n" + "ld1 { v26.d }[0], [x13], #0x8\n" + "ld1 { v10.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v26.s }[2], [x13]\n" + "ld1 { v10.s }[2], [x11]\n" + "b 59f\n" + "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v26.s }[0], [x13]\n" + "ld1 { v10.s }[0], [x11]\n" + "59:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v13.4s, v13.4s, v26.4s\n" + "add x10, x10, x15\n" + "sqrdmulh v19.4s, v19.4s, v11.4s\n" + "add x9, x9, x15\n" + "sqrdmulh v17.4s, v17.4s, v26.4s\n" + "add x28, x28, x15\n" + "sqrdmulh v25.4s, v25.4s, v11.4s\n" + "add x27, x27, x15\n" + "sqrdmulh v16.4s, v16.4s, v26.4s\n" + "and v22.16b, v13.16b, v10.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v28.16b, v19.16b, v18.16b\n" + "and v3.16b, v17.16b, v10.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v6.16b, v25.16b, v18.16b\n" + "and v0.16b, v16.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v11.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqrdmulh v23.4s, v23.4s, v26.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v11.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "srshl v13.4s, v13.4s, v10.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v18.4s\n" + "srshl v17.4s, v17.4s, v10.4s\n" + "add v13.4s, v13.4s, v15.4s\n" + "srshl v25.4s, v25.4s, v18.4s\n" + "add v19.4s, v19.4s, v15.4s\n" + "smin v13.4s, v13.4s, v12.4s\n" + "add v17.4s, v17.4s, v15.4s\n" + "smin v19.4s, v19.4s, v12.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smin v17.4s, v17.4s, v12.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "add v25.4s, v25.4s, v15.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "uzp1 v13.16b, v13.16b, v19.16b\n" + "smin v25.4s, v25.4s, v12.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "sqadd v16.4s, v16.4s, v0.4s\n" + "smax v25.4s, v25.4s, v24.4s\n" + "and v29.16b, v21.16b, v18.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v25.16b\n" + "srshl v16.4s, v16.4s, v10.4s\n" + "and v3.16b, v23.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "add v16.4s, v16.4s, v15.4s\n" + "sqadd v21.4s, 
v21.4s, v29.4s\n" + "and v25.16b, v20.16b, v18.16b\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "smin v16.4s, v16.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v18.4s\n" + "sqadd v23.4s, v23.4s, v3.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "add v21.4s, v21.4s, v15.4s\n" + "srshl v23.4s, v23.4s, v10.4s\n" + "sqadd v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v12.4s\n" + "add v23.4s, v23.4s, v15.4s\n" + "srshl v20.4s, v20.4s, v18.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smin v23.4s, v23.4s, v12.4s\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "add v20.4s, v20.4s, v15.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "smax v23.4s, v23.4s, v24.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "tbz x8, #2, 61f\n" + "st1 { v13.s }[0], [x10], #0x4\n" + "st1 { v17.s }[0], [x9], #0x4\n" + "st1 { v16.s }[0], [x28], #0x4\n" + "st1 { v23.s }[0], [x27], #0x4\n" + "tbz x8, #1, 60f\n" + "st1 { v13.h }[2], [x10], #0x2\n" + "st1 { v17.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v23.h }[2], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v13.b }[6], [x10], #0x1\n" + "st1 { v17.b }[6], [x9], #0x1\n" + "st1 { v16.b }[6], [x28], #0x1\n" + "st1 { v23.b }[6], [x27], #0x1\n" + "b 63f\n" + "60:" // Oddments: Bit 2: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v13.b }[4], [x10], #0x1\n" + "st1 { v17.b }[4], [x9], #0x1\n" + "st1 { v16.b }[4], [x28], #0x1\n" + "st1 { v23.b }[4], [x27], #0x1\n" + "b 63f\n" + "61:" // Oddments: Bit 2: Unset + "tbz x8, #1, 62f\n" + "st1 { v13.h }[0], [x10], #0x2\n" + "st1 { v17.h }[0], [x9], #0x2\n" + "st1 { v16.h }[0], [x28], #0x2\n" + "st1 { v23.h }[0], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v13.b }[2], [x10], #0x1\n" + "st1 { v17.b }[2], [x9], #0x1\n" + "st1 { v16.b }[2], [x28], #0x1\n" + "st1 { v23.b }[2], [x27], #0x1\n" + "b 63f\n" + "62:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v13.b }[0], [x10], #0x1\n" + "st1 { v17.b }[0], [x9], #0x1\n" + "st1 { v16.b }[0], [x28], #0x1\n" + "st1 { v23.b }[0], [x27], #0x1\n" + "63:" // Oddments: Bit 2: End + + "64:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..b20759eec4 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + +struct a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + + kern_type kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + + a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..3b3d9c8946 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + int8_t *const *const outptrs; + const int8_t *inptrs[25]; + + Params( + long unsigned int n_channels, + const int8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[12]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[1]; + inptrs[3] = inptrs_raw[3]; + inptrs[4] = inptrs_raw[4]; + inptrs[5] = inptrs_raw[5]; + inptrs[6] = inptrs_raw[6]; + inptrs[7] = inptrs_raw[2]; + inptrs[8] = inptrs_raw[8]; + inptrs[9] = inptrs_raw[9]; + inptrs[10] = inptrs_raw[7]; + inptrs[11] = inptrs_raw[15]; + inptrs[12] = inptrs_raw[10]; + inptrs[13] = inptrs_raw[16]; + inptrs[14] = inptrs_raw[11]; + inptrs[15] = inptrs_raw[18]; + inptrs[16] = inptrs_raw[13]; + inptrs[17] = inptrs_raw[19]; + inptrs[18] = inptrs_raw[20]; + inptrs[19] = inptrs_raw[14]; + inptrs[20] = inptrs_raw[21]; + inptrs[21] = inptrs_raw[17]; + inptrs[22] = inptrs_raw[23]; + inptrs[23] = inptrs_raw[22]; + inptrs[24] = inptrs_raw[24]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( +
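// Editor's sketch (not part of the generated kernel; acc/mul/shift/out are
+ // illustrative names): the requantization tail of this and the other s8q
+ // kernels in this patch computes, per 32-bit accumulator acc with the
+ // per-channel words loaded from requant_muls and requant_shifts, roughly:
+ //   prod = (int32_t)(((int64_t)acc * mul + (1 << 30)) >> 31);      // sqrdmulh
+ //   if (shift < 0) prod = (prod + (1 << (-shift - 1))) >> -shift;  // srshl
+ //   out = clamp(prod + qp.c_offset, qp.minval, qp.maxval);         // add/smin/smax
+ // Saturation on the doubling multiply is omitted for brevity; the uzp1
+ // pairs then narrow the clamped words back to int8 lanes. +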
"ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x5, #0x0\n" + "ldr x6, [%x[params], %[offsetof_Params_weights]]\n" + "mov x7, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x8, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x16, x4, #0x3\n" + "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v12.16b }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v13.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v11.4s }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v19.4s }, [x20]\n" + "ld1r { v14.4s }, [x19]\n" + "ldp x14, x13, [x21, #0x0]\n" + "ldp x12, x11, [x21, #0x10]\n" + "cbz x16, 3f\n" + "subs x16, x16, #0x1\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "mov v20.16b, v15.16b\n" + "ldr q10, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v15.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v17.16b, v15.16b\n" + "ldr d0, [x6, #0x0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "mov v23.16b, v10.16b\n" + "ldr d1, [x6, #0x8]\n" + "mov v22.16b, v10.16b\n" + "ldr d2, [x6, #0x10]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "mov v18.16b, v10.16b\n" + "ldr d3, [x6, #0x18]\n" + "ldr d4, [x6, #0x20]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ldr d5, [x6, #0x28]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldr d6, [x6, #0x30]\n" + "ldr d7, [x6, #0x38]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ldr d8, [x6, #0x40]\n" + "ssubl v5.8h, v5.8b, v13.8b\n" + "ldp x26, x25, [x8, #0x0]\n" + "ssubl v6.8h, v6.8b, v13.8b\n" + "ldp x24, x23, [x8, #0x10]\n" + "ssubl v7.8h, v7.8b, v13.8b\n" + "ssubl v8.8h, v8.8b, v13.8b\n" + "ldp x22, x21, [x8, #0x20]\n" + "ldp x20, x19, [x8, #0x30]\n" + "ldr d31, [x26, x5]\n" + "ssubl v31.8h, v31.8b, v12.8b\n" + "ldr d30, [x25, x5]\n" + "ldr d29, [x24, x5]\n" + "ssubl v30.8h, v30.8b, v12.8b\n" + "ldr d28, [x23, x5]\n" + "ldr d27, [x22, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "ldr d26, [x21, x5]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "ldr d25, [x20, x5]\n" + "ldr d24, [x19, x5]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v15.4s, v31.4h, v8.4h\n" + "ldr x23, [x8, #0x40]\n" + "add x6, x6, #0x48\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "ldr x22, [x8, #0x48]\n" + "subs x16, x16, #0x1\n" + "smlal v20.4s, v31.4h, v6.4h\n" + "ldr x21, [x8, #0x50]\n" + "smlal2 v23.4s, v31.8h, v6.8h\n" + "ldr x20, [x8, #0x58]\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "ldr x19, [x8, #0x60]\n" + "smlal2 v22.4s, v31.8h, v2.8h\n" + "ldr x10, [x8, #0x68]\n" + "smlal v17.4s, v31.4h, v0.4h\n" + "ldr x9, [x8, #0x70]\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr x28, [x8, #0x78]\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "ldr x27, [x8, #0x80]\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x26, [x8, #0x88]\n" + "smlal v20.4s, v28.4h, v1.4h\n" + "ldr x25, [x8, #0x90]\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x5]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "ldr x24, [x8, #0x98]\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "ldr d29, [x23, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v2.4h\n" + "ldr x23, [x8, #0xa0]\n" + "smlal2 v23.4s, 
v27.8h, v2.8h\n" + "ldr d27, [x21, x5]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "ldr x22, [x8, #0xa8]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "ldr d26, [x20, x5]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "ldr x21, [x8, #0xb0]\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "ldr d25, [x19, x5]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "ldr x20, [x8, #0xb8]\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "ldr x19, [x8, #0xc0]\n" + "smlal v20.4s, v24.4h, v0.4h\n" + "ldr q21, [x17, #0x0]\n" + "smlal2 v23.4s, v24.8h, v0.8h\n" + "ldr d24, [x9, x5]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v4.4h\n" + "ldr q30, [x15, #0x0]\n" + "smlal2 v23.4s, v29.8h, v4.8h\n" + "ldr d29, [x10, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v5.4h\n" + "ldr q31, [x17, #0x10]\n" + "smlal2 v23.4s, v28.8h, v5.8h\n" + "ldr d28, [x27, x5]\n" + "add x17, x17, #0x20\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "ldr q9, [x15, #0x10]\n" + "add x15, x15, #0x20\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "ldr d27, [x28, x5]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v26.4h, v3.4h\n" + "smlal2 v22.4s, v26.8h, v3.8h\n" + "ldr d26, [x26, x5]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v22.4s, v25.8h, v0.8h\n" + "ldr d25, [x25, x5]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v29.4h, v4.4h\n" + "smlal2 v22.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v22.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x5]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x5]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "smlal v17.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "smlal v16.4s, v25.4h, v6.4h\n" + "smlal2 v22.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x5]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v5.4h\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "ldr d26, [x21, x5]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "ldr d29, [x19, x5]\n" + "add x5, x5, #0x8\n" + "smlal v16.4s, v27.4h, v7.4h\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v22.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v16.4s, v24.4h, v5.4h\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "sqrdmulh v15.4s, v15.4s, v21.4s\n" + "smlal2 v22.4s, v24.8h, v5.8h\n" + "smlal v17.4s, v26.4h, v7.4h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "smlal v16.4s, v25.4h, v8.4h\n" + "smlal2 v22.4s, v25.8h, v8.8h\n" + "smlal v17.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "and v26.16b, v15.16b, v30.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "smlal v17.4s, v29.4h, v8.4h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "sqrdmulh v10.4s, v10.4s, v31.4s\n" + "sqrdmulh v20.4s, v20.4s, v21.4s\n" + "sqrdmulh v23.4s, v23.4s, v31.4s\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "sqadd v15.4s, v15.4s, v26.4s\n" + "and v8.16b, v10.16b, v9.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v30.4s\n" + "and v4.16b, 
v20.16b, v30.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v2.16b, v23.16b, v9.16b\n" + "and v1.16b, v16.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "add v15.4s, v15.4s, v11.4s\n" + "sqadd v10.4s, v10.4s, v8.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v31.4s\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "smin v15.4s, v15.4s, v14.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "smax v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v30.4s\n" + "add v10.4s, v10.4s, v11.4s\n" + "srshl v23.4s, v23.4s, v9.4s\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "add v20.4s, v20.4s, v11.4s\n" + "add v23.4s, v23.4s, v11.4s\n" + "smax v10.4s, v10.4s, v19.4s\n" + "smin v20.4s, v20.4s, v14.4s\n" + "smin v23.4s, v23.4s, v14.4s\n" + "uzp1 v15.16b, v15.16b, v10.16b\n" + "smax v20.4s, v20.4s, v19.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x14, x7]\n" + "smax v23.4s, v23.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v30.4s\n" + "and v24.16b, v22.16b, v9.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v23.16b\n" + "add v16.4s, v16.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v21.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str d20, [x13, x7]\n" + "smin v16.4s, v16.4s, v14.4s\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "sqadd v22.4s, v22.4s, v24.4s\n" + "and v2.16b, v17.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "smax v16.4s, v16.4s, v19.4s\n" + "srshl v22.4s, v22.4s, v9.4s\n" + "and v31.16b, v18.16b, v9.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "add v22.4s, v22.4s, v11.4s\n" + "sqadd v17.4s, v17.4s, v2.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v31.4s\n" + "smax v22.4s, v22.4s, v19.4s\n" + "uzp1 v16.16b, v16.16b, v22.16b\n" + "add v17.4s, v17.4s, v11.4s\n" + "srshl v18.4s, v18.4s, v9.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x12, x7]\n" + "smin v17.4s, v17.4s, v14.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "smax v17.4s, v17.4s, v19.4s\n" + "smin v18.4s, v18.4s, v14.4s\n" + "smax v18.4s, v18.4s, v19.4s\n" + "uzp1 v17.16b, v17.16b, v18.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x11, x7]\n" + "add x7, x7, #0x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "mov v20.16b, v15.16b\n" + "ldr q10, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v15.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v17.16b, v15.16b\n" + "ldr d0, [x6, #0x0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "mov v23.16b, v10.16b\n" + "ldr d1, [x6, #0x8]\n" + "mov v22.16b, v10.16b\n" + "ldr d2, [x6, #0x10]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "mov v18.16b, v10.16b\n" + "ldr d3, [x6, #0x18]\n" + "ldr d4, [x6, #0x20]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ldr d5, [x6, #0x28]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldr d6, [x6, #0x30]\n" + "ldr d7, [x6, #0x38]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ldr d8, [x6, #0x40]\n" + "ssubl v5.8h, v5.8b, v13.8b\n" + "ldp x26, x25, [x8, #0x0]\n" + "ssubl v6.8h, v6.8b, v13.8b\n" + "ldp x24, x23, [x8, #0x10]\n" + "ssubl v7.8h, v7.8b, v13.8b\n" + "ssubl v8.8h, v8.8b, v13.8b\n" + "ldp x22, x21, [x8, #0x20]\n" + "ldp x20, x19, [x8, #0x30]\n" + "ldr d31, [x26, x5]\n" + "ssubl v31.8h, v31.8b, v12.8b\n" + "ldr d30, [x25, x5]\n" + "ldr d29, [x24, x5]\n" + "ssubl v30.8h, v30.8b, v12.8b\n" + "ldr d28, [x23, x5]\n" + "ldr d27, [x22, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "ldr d26, [x21, x5]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "ldr d25, [x20, x5]\n" + "ldr d24, [x19, x5]\n" + 
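// The ssubl instructions around these loads widen the int8 inputs and
+ // weights to int16 while subtracting the zero points broadcast in v12
+ // (a_offset) and v13 (b_offset), so each smlal/smlal2 accumulates an exact
+ // (input - a_offset) * (weight - b_offset) product into the 32-bit lanes. +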
"ssubl v27.8h, v27.8b, v12.8b\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v15.4s, v31.4h, v8.4h\n" + "ldr x23, [x8, #0x40]\n" + "tst x4, #0x7\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "ldr x22, [x8, #0x48]\n" + "smlal v20.4s, v31.4h, v6.4h\n" + "ldr x21, [x8, #0x50]\n" + "smlal2 v23.4s, v31.8h, v6.8h\n" + "ldr x20, [x8, #0x58]\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "ldr x19, [x8, #0x60]\n" + "smlal2 v22.4s, v31.8h, v2.8h\n" + "ldr x10, [x8, #0x68]\n" + "smlal v17.4s, v31.4h, v0.4h\n" + "ldr x9, [x8, #0x70]\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr x28, [x8, #0x78]\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "ldr x27, [x8, #0x80]\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x26, [x8, #0x88]\n" + "smlal v20.4s, v28.4h, v1.4h\n" + "ldr x25, [x8, #0x90]\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x5]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "ldr x24, [x8, #0x98]\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "ldr d29, [x23, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v2.4h\n" + "ldr x23, [x8, #0xa0]\n" + "smlal2 v23.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x5]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "ldr x22, [x8, #0xa8]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "ldr d26, [x20, x5]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "ldr x21, [x8, #0xb0]\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "ldr d25, [x19, x5]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "ldr x20, [x8, #0xb8]\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "ldr x19, [x8, #0xc0]\n" + "smlal v20.4s, v24.4h, v0.4h\n" + "ldr q21, [x17, #0x0]\n" + "smlal2 v23.4s, v24.8h, v0.8h\n" + "ldr d24, [x9, x5]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v4.4h\n" + "ldr q30, [x15, #0x0]\n" + "smlal2 v23.4s, v29.8h, v4.8h\n" + "ldr d29, [x10, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v5.4h\n" + "ldr q31, [x17, #0x10]\n" + "smlal2 v23.4s, v28.8h, v5.8h\n" + "ldr d28, [x27, x5]\n" + "add x17, x17, #0x20\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "ldr q9, [x15, #0x10]\n" + "add x15, x15, #0x20\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "ldr d27, [x28, x5]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v26.4h, v3.4h\n" + "smlal2 v22.4s, v26.8h, v3.8h\n" + "ldr d26, [x26, x5]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v22.4s, v25.8h, v0.8h\n" + "ldr d25, [x25, x5]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v29.4h, v4.4h\n" + "smlal2 v22.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x5]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v22.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x5]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x5]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "smlal v17.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "smlal v16.4s, v25.4h, v6.4h\n" + "smlal2 v22.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x5]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v5.4h\n" + "smlal2 v18.4s, 
v26.8h, v5.8h\n" + "ldr d26, [x21, x5]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "ldr d29, [x19, x5]\n" + "add x5, x5, #0x8\n" + "smlal v16.4s, v27.4h, v7.4h\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v22.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v16.4s, v24.4h, v5.4h\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "sqrdmulh v15.4s, v15.4s, v21.4s\n" + "smlal2 v22.4s, v24.8h, v5.8h\n" + "smlal v17.4s, v26.4h, v7.4h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "smlal v16.4s, v25.4h, v8.4h\n" + "smlal2 v22.4s, v25.8h, v8.8h\n" + "smlal v17.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "and v26.16b, v15.16b, v30.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "smlal v17.4s, v29.4h, v8.4h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "sqrdmulh v10.4s, v10.4s, v31.4s\n" + "sqrdmulh v20.4s, v20.4s, v21.4s\n" + "sqrdmulh v23.4s, v23.4s, v31.4s\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "sqadd v15.4s, v15.4s, v26.4s\n" + "and v8.16b, v10.16b, v9.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v30.4s\n" + "and v4.16b, v20.16b, v30.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v2.16b, v23.16b, v9.16b\n" + "and v1.16b, v16.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "add v15.4s, v15.4s, v11.4s\n" + "sqadd v10.4s, v10.4s, v8.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v31.4s\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "smin v15.4s, v15.4s, v14.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "smax v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v30.4s\n" + "add v10.4s, v10.4s, v11.4s\n" + "srshl v23.4s, v23.4s, v9.4s\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "add v20.4s, v20.4s, v11.4s\n" + "add v23.4s, v23.4s, v11.4s\n" + "smax v10.4s, v10.4s, v19.4s\n" + "smin v20.4s, v20.4s, v14.4s\n" + "smin v23.4s, v23.4s, v14.4s\n" + "uzp1 v15.16b, v15.16b, v10.16b\n" + "smax v20.4s, v20.4s, v19.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x14, x7]\n" + "smax v23.4s, v23.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v30.4s\n" + "and v24.16b, v22.16b, v9.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v23.16b\n" + "add v16.4s, v16.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v21.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str d20, [x13, x7]\n" + "smin v16.4s, v16.4s, v14.4s\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "sqadd v22.4s, v22.4s, v24.4s\n" + "and v2.16b, v17.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "smax v16.4s, v16.4s, v19.4s\n" + "srshl v22.4s, v22.4s, v9.4s\n" + "and v31.16b, v18.16b, v9.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "add v22.4s, v22.4s, v11.4s\n" + "sqadd v17.4s, v17.4s, v2.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v31.4s\n" + "smax v22.4s, v22.4s, v19.4s\n" + "uzp1 v16.16b, v16.16b, v22.16b\n" + "add v17.4s, v17.4s, v11.4s\n" + "srshl v18.4s, v18.4s, v9.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x12, x7]\n" + "smin v17.4s, v17.4s, v14.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "smax v17.4s, v17.4s, v19.4s\n" + "smin v18.4s, v18.4s, v14.4s\n" + "smax v18.4s, v18.4s, v19.4s\n" + "uzp1 v17.16b, v17.16b, v18.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x11, x7]\n" + "add x7, x7, #0x8\n" + "beq 88f\n" + "add x6, x6, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x4, #2, 5f\n" + "ld1 { v15.4s }, [x19], 
#0x10\n" + "tbz x4, #1, 4f\n" + "ld1 { v10.d }[0], [x19], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v10.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x4, #1, 6f\n" + "ld1 { v15.d }[0], [x19], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v20.16b, v15.16b\n" + "ldr d0, [x6, #0x0]\n" + "mov v23.16b, v10.16b\n" + "ldr d1, [x6, #0x8]\n" + "mov v16.16b, v15.16b\n" + "ldr d2, [x6, #0x10]\n" + "mov v22.16b, v10.16b\n" + "ldr d3, [x6, #0x18]\n" + "mov v17.16b, v15.16b\n" + "ldr d4, [x6, #0x20]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "mov v18.16b, v10.16b\n" + "ldr d5, [x6, #0x28]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "ldr d6, [x6, #0x30]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ldr d7, [x6, #0x38]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldr d8, [x6, #0x40]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ldp x26, x25, [x8, #0x0]\n" + "ssubl v5.8h, v5.8b, v13.8b\n" + "ldp x24, x23, [x8, #0x10]\n" + "ssubl v6.8h, v6.8b, v13.8b\n" + "ssubl v7.8h, v7.8b, v13.8b\n" + "ldp x22, x21, [x8, #0x20]\n" + "ssubl v8.8h, v8.8b, v13.8b\n" + "ldp x20, x19, [x8, #0x30]\n" + "add x26, x26, x5\n" + "add x25, x25, x5\n" + "add x24, x24, x5\n" + "add x23, x23, x5\n" + "add x22, x22, x5\n" + "add x21, x21, x5\n" + "add x20, x20, x5\n" + "add x19, x19, x5\n" + "tbz x4, #2, 9f\n" + "ld1 { v31.s }[0], [x26], #0x4\n" + "ld1 { v30.s }[0], [x25], #0x4\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x4, #1, 8f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v30.h }[2], [x25], #0x2\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "ld1 { v24.h }[2], [x19], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v30.b }[6], [x25]\n" + "ld1 { v29.b }[6], [x24]\n" + "ld1 { v28.b }[6], [x23]\n" + "ld1 { v27.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "ld1 { v24.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v30.b }[4], [x25]\n" + "ld1 { v29.b }[4], [x24]\n" + "ld1 { v28.b }[4], [x23]\n" + "ld1 { v27.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "ld1 { v24.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x4, #1, 10f\n" + "ld1 { v31.h }[0], [x26], #0x2\n" + "ld1 { v30.h }[0], [x25], #0x2\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v30.b }[2], [x25]\n" + "ld1 { v29.b }[2], [x24]\n" + "ld1 { v28.b }[2], [x23]\n" + "ld1 { v27.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "ld1 { v24.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[0], [x26]\n" + "ld1 { v30.b }[0], [x25]\n" 
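+ // The tbz ladders in this oddments path decompose the remaining channel
+ // count (n_channels % 8, held in x4) into its 4/2/1 components and issue
+ // matching word/half/byte ld1 lane loads, so only the valid trailing bytes
+ // of each row are ever touched.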
+ "ld1 { v29.b }[0], [x24]\n" + "ld1 { v28.b }[0], [x23]\n" + "ld1 { v27.b }[0], [x22]\n" + "ld1 { v26.b }[0], [x21]\n" + "ld1 { v25.b }[0], [x20]\n" + "ld1 { v24.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ldr x23, [x8, #0x40]\n" + "ssubl v31.8h, v31.8b, v12.8b\n" + "smlal v15.4s, v31.4h, v8.4h\n" + "ssubl v30.8h, v30.8b, v12.8b\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v31.4h, v6.4h\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal2 v23.4s, v31.8h, v6.8h\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal2 v22.4s, v31.8h, v2.8h\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v17.4s, v31.4h, v0.4h\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "add x23, x23, x5\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "smlal v20.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "smlal v20.4s, v27.4h, v2.4h\n" + "smlal2 v23.4s, v27.8h, v2.8h\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "smlal v20.4s, v24.4h, v0.4h\n" + "smlal2 v23.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "tbz x4, #2, 13f\n" + "ld1 { v29.s }[0], [x23], #0x4\n" + "tbz x4, #1, 12f\n" + "ld1 { v29.h }[2], [x23], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[6], [x23]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[4], [x23]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x4, #1, 14f\n" + "ld1 { v29.h }[0], [x23], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[2], [x23]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[0], [x23]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "ldr x22, [x8, #0x48]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v4.4h\n" + "smlal2 v23.4s, v29.8h, v4.8h\n" + "add x22, x22, x5\n" + "tbz x4, #2, 17f\n" + "ld1 { v28.s }[0], [x22], #0x4\n" + "tbz x4, #1, 16f\n" + "ld1 { v28.h }[2], [x22], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[6], [x22]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[4], [x22]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x4, #1, 18f\n" + "ld1 { v28.h }[0], [x22], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[2], [x22]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[0], [x22]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "ldr x21, [x8, #0x50]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v5.4h\n" + "smlal2 v23.4s, v28.8h, v5.8h\n" + "add x21, x21, x5\n" + "tbz x4, #2, 21f\n" + "ld1 { v27.s }[0], [x21], #0x4\n" + "tbz x4, #1, 20f\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[6], [x21]\n" + "b 23f\n" + "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[4], [x21]\n" + "b 23f\n" + "21:" // Oddments: Load (1, 2): Bit 2: Unset + "tbz x4, #1, 22f\n" + "ld1 { v27.h }[0], [x21], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[2], [x21]\n" + "b 23f\n" + "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[0], [x21]\n" + "23:" // Oddments: Load (1, 
2): Bit 2: End + "ldr x20, [x8, #0x58]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "add x20, x20, x5\n" + "smlal v20.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "tbz x4, #2, 25f\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "tbz x4, #1, 24f\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[6], [x20]\n" + "b 27f\n" + "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[4], [x20]\n" + "b 27f\n" + "25:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x4, #1, 26f\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[2], [x20]\n" + "b 27f\n" + "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[0], [x20]\n" + "27:" // Oddments: Load (3, 0): Bit 2: End + "ldr x19, [x8, #0x60]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v16.4s, v26.4h, v3.4h\n" + "smlal2 v22.4s, v26.8h, v3.8h\n" + "add x19, x19, x5\n" + "tbz x4, #2, 29f\n" + "ld1 { v25.s }[0], [x19], #0x4\n" + "tbz x4, #1, 28f\n" + "ld1 { v25.h }[2], [x19], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[6], [x19]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[4], [x19]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x4, #1, 30f\n" + "ld1 { v25.h }[0], [x19], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[2], [x19]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[0], [x19]\n" + "31:" // Oddments: Load (2, 0): Bit 2: End + "ldr x10, [x8, #0x68]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "add x10, x10, x5\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v22.4s, v25.8h, v0.8h\n" + "tbz x4, #2, 33f\n" + "ld1 { v29.s }[0], [x10], #0x4\n" + "tbz x4, #1, 32f\n" + "ld1 { v29.h }[2], [x10], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[6], [x10]\n" + "b 35f\n" + "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[4], [x10]\n" + "b 35f\n" + "33:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x4, #1, 34f\n" + "ld1 { v29.h }[0], [x10], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[2], [x10]\n" + "b 35f\n" + "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[0], [x10]\n" + "35:" // Oddments: Load (3, 1): Bit 2: End + "ldr x9, [x8, #0x70]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v16.4s, v29.4h, v4.4h\n" + "smlal2 v22.4s, v29.8h, v4.8h\n" + "add x9, x9, x5\n" + "tbz x4, #2, 37f\n" + "ld1 { v24.s }[0], [x9], #0x4\n" + "tbz x4, #1, 36f\n" + "ld1 { v24.h }[2], [x9], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[6], [x9]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[4], [x9]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x4, #1, 38f\n" + "ld1 { v24.h }[0], [x9], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[2], [x9]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[0], [x9]\n" + "39:" // Oddments: Load (2, 1): Bit 2: End + "ldr x28, [x8, #0x78]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "add x28, x28, x5\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v22.4s, v24.8h, v1.8h\n" + "tbz x4, #2, 41f\n" + "ld1 { v27.s }[0], [x28], #0x4\n" + "tbz x4, #1, 
40f\n" + "ld1 { v27.h }[2], [x28], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[6], [x28]\n" + "b 43f\n" + "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[4], [x28]\n" + "b 43f\n" + "41:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x4, #1, 42f\n" + "ld1 { v27.h }[0], [x28], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[2], [x28]\n" + "b 43f\n" + "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[0], [x28]\n" + "43:" // Oddments: Load (3, 3): Bit 2: End + "ldr x27, [x8, #0x80]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "add x27, x27, x5\n" + "tbz x4, #2, 45f\n" + "ld1 { v28.s }[0], [x27], #0x4\n" + "tbz x4, #1, 44f\n" + "ld1 { v28.h }[2], [x27], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[6], [x27]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[4], [x27]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x4, #1, 46f\n" + "ld1 { v28.h }[0], [x27], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[2], [x27]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[0], [x27]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "ldr x26, [x8, #0x88]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "add x26, x26, x5\n" + "smlal v17.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "tbz x4, #2, 49f\n" + "ld1 { v26.s }[0], [x26], #0x4\n" + "tbz x4, #1, 48f\n" + "ld1 { v26.h }[2], [x26], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[6], [x26]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[4], [x26]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x4, #1, 50f\n" + "ld1 { v26.h }[0], [x26], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[2], [x26]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[0], [x26]\n" + "51:" // Oddments: Load (3, 4): Bit 2: End + "ldr x25, [x8, #0x90]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v5.4h\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "add x25, x25, x5\n" + "tbz x4, #2, 53f\n" + "ld1 { v25.s }[0], [x25], #0x4\n" + "tbz x4, #1, 52f\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[6], [x25]\n" + "b 55f\n" + "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[4], [x25]\n" + "b 55f\n" + "53:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x4, #1, 54f\n" + "ld1 { v25.h }[0], [x25], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[2], [x25]\n" + "b 55f\n" + "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[0], [x25]\n" + "55:" // Oddments: Load (4, 0): Bit 2: End + "ldr x24, [x8, #0x98]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v25.4h, v6.4h\n" + "smlal2 v22.4s, v25.8h, v6.8h\n" + "add x24, x24, x5\n" + "tbz x4, #2, 57f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x4, #1, 56f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 59f\n" + "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 59f\n" + "57:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x4, #1, 58f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v29.b 
}[2], [x24]\n" + "b 59f\n" + "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v29.b }[0], [x24]\n" + "59:" // Oddments: Load (2, 4): Bit 2: End + "ldr x23, [x8, #0xa0]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "add x23, x23, x5\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "tbz x4, #2, 61f\n" + "ld1 { v27.s }[0], [x23], #0x4\n" + "tbz x4, #1, 60f\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[6], [x23]\n" + "b 63f\n" + "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[4], [x23]\n" + "b 63f\n" + "61:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x4, #1, 62f\n" + "ld1 { v27.h }[0], [x23], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[2], [x23]\n" + "b 63f\n" + "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[0], [x23]\n" + "63:" // Oddments: Load (4, 1): Bit 2: End + "ldr x22, [x8, #0xa8]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v27.4h, v7.4h\n" + "smlal2 v22.4s, v27.8h, v7.8h\n" + "add x22, x22, x5\n" + "tbz x4, #2, 65f\n" + "ld1 { v24.s }[0], [x22], #0x4\n" + "tbz x4, #1, 64f\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[6], [x22]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[4], [x22]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x4, #1, 66f\n" + "ld1 { v24.h }[0], [x22], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[2], [x22]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[0], [x22]\n" + "67:" // Oddments: Load (3, 2): Bit 2: End + "ldr x21, [x8, #0xb0]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal v16.4s, v24.4h, v5.4h\n" + "smlal2 v22.4s, v24.8h, v5.8h\n" + "add x21, x21, x5\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "tbz x4, #2, 69f\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "tbz x4, #1, 68f\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[6], [x21]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[4], [x21]\n" + "b 71f\n" + "69:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x4, #1, 70f\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[2], [x21]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[0], [x21]\n" + "71:" // Oddments: Load (4, 3): Bit 2: End + "ldr x20, [x8, #0xb8]\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v7.4h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "add x20, x20, x5\n" + "tbz x4, #2, 73f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x4, #1, 72f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x4, #1, 74f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[0], [x20]\n" + "75:" // Oddments: Load (4, 2): Bit 2: End + "ldr x19, [x8, #0xc0]\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v25.4h, v8.4h\n" + "smlal2 v22.4s, 
v25.8h, v8.8h\n" + "add x19, x19, x5\n" + "smlal v17.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "tbz x4, #2, 77f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x4, #1, 76f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[6], [x19]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[4], [x19]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x4, #1, 78f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[2], [x19]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[0], [x19]\n" + "79:" // Oddments: Load (4, 4): Bit 2: End + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v17.4s, v29.4h, v8.4h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "tbz x4, #2, 81f\n" + "ld1 { v21.4s }, [x17], #0x10\n" + "ld1 { v30.4s }, [x15], #0x10\n" + "tbz x4, #1, 80f\n" + "ld1 { v31.d }[0], [x17], #0x8\n" + "ld1 { v9.d }[0], [x15], #0x8\n" + "tbz x4, #0, 83f\n" + "ld1 { v31.s }[2], [x17]\n" + "ld1 { v9.s }[2], [x15]\n" + "b 83f\n" + "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v31.s }[0], [x17]\n" + "ld1 { v9.s }[0], [x15]\n" + "b 83f\n" + "81:" // Oddments: Load requant params: Bit 2: Unset + "tbz x4, #1, 82f\n" + "ld1 { v21.d }[0], [x17], #0x8\n" + "ld1 { v30.d }[0], [x15], #0x8\n" + "tbz x4, #0, 83f\n" + "ld1 { v21.s }[2], [x17]\n" + "ld1 { v30.s }[2], [x15]\n" + "b 83f\n" + "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v21.s }[0], [x17]\n" + "ld1 { v30.s }[0], [x15]\n" + "83:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v15.4s, v15.4s, v21.4s\n" + "add x14, x14, x7\n" + "sqrdmulh v10.4s, v10.4s, v31.4s\n" + "add x13, x13, x7\n" + "sqrdmulh v20.4s, v20.4s, v21.4s\n" + "add x12, x12, x7\n" + "sqrdmulh v23.4s, v23.4s, v31.4s\n" + "add x11, x11, x7\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "and v26.16b, v15.16b, v30.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v8.16b, v10.16b, v9.16b\n" + "and v4.16b, v20.16b, v30.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v2.16b, v23.16b, v9.16b\n" + "and v1.16b, v16.16b, v30.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v31.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v26.4s\n" + "sqrdmulh v17.4s, v17.4s, v21.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "sqadd v10.4s, v10.4s, v8.4s\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "srshl v15.4s, v15.4s, v30.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "srshl v20.4s, v20.4s, v30.4s\n" + "add v15.4s, v15.4s, v11.4s\n" + "srshl v23.4s, v23.4s, v9.4s\n" + "add v10.4s, v10.4s, v11.4s\n" + "smin v15.4s, v15.4s, v14.4s\n" + "add v20.4s, v20.4s, v11.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "smax v15.4s, v15.4s, v19.4s\n" + "smin v20.4s, v20.4s, v14.4s\n" + "smax v10.4s, v10.4s, v19.4s\n" + "add v23.4s, v23.4s, v11.4s\n" + "smax v20.4s, v20.4s, v19.4s\n" + "uzp1 v15.16b, v15.16b, v10.16b\n" + "smin v23.4s, v23.4s, v14.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "smax v23.4s, v23.4s, v19.4s\n" + "and v24.16b, v22.16b, v9.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v23.16b\n" + "srshl v16.4s, v16.4s, v30.4s\n" + "and v2.16b, v17.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "add v16.4s, v16.4s, v11.4s\n" + "sqadd v22.4s, v22.4s, v24.4s\n" + "and 
v31.16b, v18.16b, v9.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "smin v16.4s, v16.4s, v14.4s\n" + "srshl v22.4s, v22.4s, v9.4s\n" + "sqadd v17.4s, v17.4s, v2.4s\n" + "smax v16.4s, v16.4s, v19.4s\n" + "add v22.4s, v22.4s, v11.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v31.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "srshl v18.4s, v18.4s, v9.4s\n" + "smax v22.4s, v22.4s, v19.4s\n" + "smin v17.4s, v17.4s, v14.4s\n" + "uzp1 v16.16b, v16.16b, v22.16b\n" + "add v18.4s, v18.4s, v11.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "smax v17.4s, v17.4s, v19.4s\n" + "smin v18.4s, v18.4s, v14.4s\n" + "smax v18.4s, v18.4s, v19.4s\n" + "uzp1 v17.16b, v17.16b, v18.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "tbz x4, #2, 85f\n" + "st1 { v15.s }[0], [x14], #0x4\n" + "st1 { v20.s }[0], [x13], #0x4\n" + "st1 { v16.s }[0], [x12], #0x4\n" + "st1 { v17.s }[0], [x11], #0x4\n" + "tbz x4, #1, 84f\n" + "st1 { v15.h }[2], [x14], #0x2\n" + "st1 { v20.h }[2], [x13], #0x2\n" + "st1 { v16.h }[2], [x12], #0x2\n" + "st1 { v17.h }[2], [x11], #0x2\n" + "tbz x4, #0, 87f\n" + "st1 { v15.b }[6], [x14], #0x1\n" + "st1 { v20.b }[6], [x13], #0x1\n" + "st1 { v16.b }[6], [x12], #0x1\n" + "st1 { v17.b }[6], [x11], #0x1\n" + "b 87f\n" + "84:" // Oddments: Bit 2: Bit 1: Unset + "tbz x4, #0, 87f\n" + "st1 { v15.b }[4], [x14], #0x1\n" + "st1 { v20.b }[4], [x13], #0x1\n" + "st1 { v16.b }[4], [x12], #0x1\n" + "st1 { v17.b }[4], [x11], #0x1\n" + "b 87f\n" + "85:" // Oddments: Bit 2: Unset + "tbz x4, #1, 86f\n" + "st1 { v15.h }[0], [x14], #0x2\n" + "st1 { v20.h }[0], [x13], #0x2\n" + "st1 { v16.h }[0], [x12], #0x2\n" + "st1 { v17.h }[0], [x11], #0x2\n" + "tbz x4, #0, 87f\n" + "st1 { v15.b }[2], [x14], #0x1\n" + "st1 { v20.b }[2], [x13], #0x1\n" + "st1 { v16.b }[2], [x12], #0x1\n" + "st1 { v17.b }[2], [x11], #0x1\n" + "b 87f\n" + "86:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 87f\n" + "st1 { v15.b }[0], [x14], #0x1\n" + "st1 { v20.b }[0], [x13], #0x1\n" + "st1 { v16.b }[0], [x12], #0x1\n" + "st1 { v17.b }[0], [x11], #0x1\n" + "87:" // Oddments: Bit 2: End + + "88:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..a998fa16d6 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + +struct a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size; + + kern_type kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + + a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..ab64f53f66 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,2213 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + int8_t *const *const outptrs; + const int8_t *inptrs[36]; + + Params( + long unsigned int n_channels, + const int8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[0]; + inptrs[1] = inptrs_raw[1]; + inptrs[2] = inptrs_raw[6]; + inptrs[3] = inptrs_raw[7]; + inptrs[4] = inptrs_raw[2]; + inptrs[5] = inptrs_raw[8]; + inptrs[6] = inptrs_raw[3]; + inptrs[7] = inptrs_raw[4]; + inptrs[8] = inptrs_raw[11]; + inptrs[9] = inptrs_raw[12]; + inptrs[10] = inptrs_raw[9]; + inptrs[11] = inptrs_raw[10]; + inptrs[12] = inptrs_raw[5]; + inptrs[13] = inptrs_raw[13]; + inptrs[14] = inptrs_raw[14]; + inptrs[15] = inptrs_raw[15]; + inptrs[16] = inptrs_raw[16]; + inptrs[17] = inptrs_raw[17]; + inptrs[18] = inptrs_raw[18]; + inptrs[19] = inptrs_raw[19]; + inptrs[20] = inptrs_raw[20]; + inptrs[21] = inptrs_raw[21]; + inptrs[22] = inptrs_raw[22]; + inptrs[23] = inptrs_raw[23]; + inptrs[24] = inptrs_raw[24]; + inptrs[25] = inptrs_raw[25]; + inptrs[26] = inptrs_raw[26]; + inptrs[27] = inptrs_raw[27]; + inptrs[28] = inptrs_raw[28]; + inptrs[29] =
inptrs_raw[29]; + inptrs[30] = inptrs_raw[30]; + inptrs[31] = inptrs_raw[31]; + inptrs[32] = inptrs_raw[32]; + inptrs[33] = inptrs_raw[33]; + inptrs[34] = inptrs_raw[34]; + inptrs[35] = inptrs_raw[35]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x10, #0x0\n" + "ldr x3, [%x[params], %[offsetof_Params_weights]]\n" + "mov x1, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x25, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x19, x4, #0x3\n" + "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x13, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v7.16b }, [x13]\n" + "add x8, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v13.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v19.4s }, [x8]\n" + "add x8, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v16.4s }, [x20]\n" + "ld1r { v12.4s }, [x8]\n" + "ldp x17, x16, [x21, #0x0]\n" + "ldp x6, x8, [x21, #0x10]\n" + "cbz x19, 3f\n" + "subs x19, x19, #0x1\n" + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x12, #0x0]\n" + "mov v18.16b, v15.16b\n" + "ldr q20, [x12, #0x10]\n" + "add x12, x12, #0x20\n" + "mov v11.16b, v15.16b\n" + "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "mov v10.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "mov v5.16b, v20.16b\n" + "ldr d1, [x3, #0x8]\n" + "mov v8.16b, v20.16b\n" + "ldr d2, [x3, #0x10]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "mov v9.16b, v20.16b\n" + "ldr d3, [x3, #0x18]\n" + "ldr d4, [x3, #0x20]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ldp x28, x27, [x25, #0x0]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "ldr d31, [x28, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "ldr d30, [x27, x10]\n" + "ldr d29, [x26, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "ldr d28, [x13, x10]\n" + "ldr d27, [x24, x10]\n" + "ssubl v29.8h, v29.8b, v7.8b\n" + "ldr d23, [x23, x10]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "ldr d25, [x22, x10]\n" + "ldr d24, [x21, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "ldr d26, [x20, x10]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "ldr d22, [x0, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "ssubl v22.8h, v22.8b, v7.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v15.4s, v31.4h, v0.4h\n" + "ldr x20, [x25, #0x50]\n" + "subs x19, x19, #0x1\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr x28, [x25, #0x58]\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "ldr x0, [x25, #0x60]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ldr d31, [x20, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "ldr x7, [x25, #0x68]\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "ldr x26, [x25, #0x70]\n" + "smlal v10.4s, v28.4h, v0.4h\n" + "ldr x23, [x25, #0x78]\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ldr d0, [x3, #0x28]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "ldr x20, [x25, #0x80]\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v1.4h\n" + "ldr x22, [x25, #0x88]\n" 
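+ // With 25 taps, this 5x5 kernel cannot keep the whole filter resident next
+ // to its four accumulator pairs, which appears to be why v0-v4 are cycled:
+ // each group of five taps is reloaded from the weights pointer in x3 and
+ // re-widened with ssubl as the window position advances.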
+ "smlal2 v5.4s, v27.8h, v1.8h\n" + "ldr x13, [x25, #0x90]\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "ldr x21, [x25, #0x98]\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "ldr x14, [x25, #0xa0]\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "smlal2 v9.4s, v23.8h, v1.8h\n" + "ldr d1, [x3, #0x30]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "ldr d27, [x0, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "ldr x0, [x25, #0xb8]\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr x15, [x25, #0xc0]\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "ldr x9, [x25, #0xc8]\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "ldr x27, [x25, #0xd0]\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr x28, [x25, #0xd8]\n" + "smlal2 v9.4s, v31.8h, v2.8h\n" + "ldr d2, [x3, #0x38]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr q6, [x2, #0x0]\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x7, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "ldr x12, [x25, #0xe0]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ldr q21, [x5, #0x0]\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "ldr q17, [x2, #0x10]\n" + "add x2, x2, #0x20\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "ldr q14, [x5, #0x10]\n" + "add x5, x5, #0x20\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "smlal2 v9.4s, v30.8h, v3.8h\n" + "ldr d3, [x3, #0x40]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x26, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "ldr x7, [x25, #0xe8]\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v11.4s, v30.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0x48]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v20.4s, v29.8h, v0.8h\n" + "smlal v18.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal2 v9.4s, v25.8h, v0.8h\n" + "ldr d0, [x3, #0x50]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x10]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v1.4h\n" + "ldr x23, [x25, #0xf8]\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal2 v9.4s, v24.8h, v1.8h\n" + "ldr d1, [x3, #0x58]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v20.4s, v23.8h, v2.8h\n" + "ldr d23, [x20, x10]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v31.4h, v2.4h\n" + "ldr x22, [x25, #0x100]\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal2 v9.4s, v27.8h, v2.8h\n" + "ldr d2, [x3, #0x60]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v20.4s, v31.8h, v3.8h\n" + "ldr d31, [x13, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v3.4h\n" + "ldr x20, [x25, #0x108]\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v27.4h, v3.4h\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "smlal v10.4s, v23.4h, v3.4h\n" + "smlal2 v9.4s, v23.8h, v3.8h\n" + "ldr d3, [x3, #0x68]\n" + "ssubl v3.8h, 
v3.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "ldr x13, [x25, #0x110]\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d26, [x14, x10]\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "ldr x21, [x25, #0x118]\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr d4, [x3, #0x70]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v20.4s, v22.8h, v0.8h\n" + "ldr d22, [x0, x10]\n" + "ssubl v22.8h, v22.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "smlal v10.4s, v30.4h, v0.4h\n" + "smlal2 v9.4s, v30.8h, v0.8h\n" + "ldr d0, [x3, #0x78]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v20.4s, v25.8h, v1.8h\n" + "ldr d25, [x11, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "smlal v10.4s, v26.4h, v1.4h\n" + "smlal2 v9.4s, v26.8h, v1.8h\n" + "ldr d1, [x3, #0x80]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v20.4s, v24.8h, v2.8h\n" + "ldr d24, [x24, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "ldr d2, [x3, #0x88]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "ldr d27, [x15, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "ldr d3, [x3, #0x90]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v20.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x10]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d28, [x12, x10]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v22.4h, v4.4h\n" + "smlal2 v9.4s, v22.8h, v4.8h\n" + "ldr d4, [x3, #0x98]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x27, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "smlal v10.4s, v23.4h, v0.4h\n" + "smlal2 v9.4s, v23.8h, v0.8h\n" + "ldr d0, [x3, #0xa0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v10.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "ldr d1, [x3, #0xa8]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v20.4s, v26.8h, v2.8h\n" + "ldr d26, [x7, x10]\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v31.4h, 
v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v10.4s, v30.4h, v2.4h\n" + "smlal2 v9.4s, v30.8h, v2.8h\n" + "ldr d2, [x3, #0xb0]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x26, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v10.4s, v28.4h, v3.4h\n" + "smlal2 v9.4s, v28.8h, v3.8h\n" + "ldr d3, [x3, #0xb8]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0xc0]\n" + "add x3, x3, #0xc8\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v20.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x10]\n" + "smlal v18.4s, v23.4h, v0.4h\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ldr d25, [x20, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v0.4h\n" + "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v20.4s, v23.8h, v1.8h\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "ldr d24, [x13, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v1.4h\n" + "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v20.4s, v31.8h, v2.8h\n" + "smlal v18.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x10]\n" + "add x10, x10, #0x8\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v20.4s, v30.8h, v3.8h\n" + "smlal v18.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v20.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v27.4h, v4.4h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "sqrdmulh v20.4s, v20.4s, v17.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v5.4s, v5.4s, v17.4s\n" + "and v1.16b, v15.16b, v21.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v29.16b, v20.16b, v14.16b\n" + "and v3.16b, v18.16b, v21.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v2.16b, v5.16b, v14.16b\n" + "sqrdmulh v11.4s, v11.4s, v6.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v1.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "and v0.16b, v11.16b, v21.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "sqadd v5.4s, v5.4s, v2.4s\n" + "and v27.16b, v8.16b, v14.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "add v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v21.4s\n" + "srshl v5.4s, 
v5.4s, v14.4s\n" + "smin v15.4s, v15.4s, v12.4s\n" + "add v20.4s, v20.4s, v19.4s\n" + "add v18.4s, v18.4s, v19.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smin v18.4s, v18.4s, v12.4s\n" + "add v5.4s, v5.4s, v19.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smin v5.4s, v5.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "sqadd v11.4s, v11.4s, v0.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x17, x1]\n" + "smax v5.4s, v5.4s, v16.4s\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "srshl v11.4s, v11.4s, v21.4s\n" + "and v30.16b, v10.16b, v21.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "uzp1 v18.16b, v18.16b, v5.16b\n" + "add v11.4s, v11.4s, v19.4s\n" + "srshl v8.4s, v8.4s, v14.4s\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d18, [x16, x1]\n" + "smin v11.4s, v11.4s, v12.4s\n" + "sqrdmulh v9.4s, v9.4s, v17.4s\n" + "add v8.4s, v8.4s, v19.4s\n" + "sqadd v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "smin v8.4s, v8.4s, v12.4s\n" + "and v6.16b, v9.16b, v14.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "smax v8.4s, v8.4s, v16.4s\n" + "srshl v10.4s, v10.4s, v21.4s\n" + "uzp1 v11.16b, v11.16b, v8.16b\n" + "add v10.4s, v10.4s, v19.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x6, x1]\n" + "smin v10.4s, v10.4s, v12.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "srshl v9.4s, v9.4s, v14.4s\n" + "add v9.4s, v9.4s, v19.4s\n" + "smin v9.4s, v9.4s, v12.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v10.16b, v10.16b, v10.16b\n" + "str d10, [x8, x1]\n" + "add x1, x1, #0x8\n" + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x12, #0x0]\n" + "mov v18.16b, v15.16b\n" + "ldr q20, [x12, #0x10]\n" + "add x12, x12, #0x20\n" + "mov v11.16b, v15.16b\n" + "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "mov v10.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "mov v5.16b, v20.16b\n" + "ldr d1, [x3, #0x8]\n" + "mov v8.16b, v20.16b\n" + "ldr d2, [x3, #0x10]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "mov v9.16b, v20.16b\n" + "ldr d3, [x3, #0x18]\n" + "ldr d4, [x3, #0x20]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ldp x28, x27, [x25, #0x0]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "ldr d31, [x28, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "ldr d30, [x27, x10]\n" + "ldr d29, [x26, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "ldr d28, [x13, x10]\n" + "ldr d27, [x24, x10]\n" + "ssubl v29.8h, v29.8b, v7.8b\n" + "ldr d23, [x23, x10]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "ldr d25, [x22, x10]\n" + "ldr d24, [x21, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "ldr d26, [x20, x10]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "ldr d22, [x0, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "ssubl v22.8h, v22.8b, v7.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v15.4s, v31.4h, v0.4h\n" + "ldr x20, [x25, #0x50]\n" + "tst x4, #0x7\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr x28, [x25, #0x58]\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "ldr x0, [x25, #0x60]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ldr d31, [x20, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "ldr x7, [x25, #0x68]\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "ldr x26, [x25, #0x70]\n" + "smlal v10.4s, v28.4h, v0.4h\n" + "ldr x23, [x25, #0x78]\n" + "smlal2 
v9.4s, v28.8h, v0.8h\n" + "ldr d0, [x3, #0x28]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "ldr x20, [x25, #0x80]\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v1.4h\n" + "ldr x22, [x25, #0x88]\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "ldr x13, [x25, #0x90]\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "ldr x21, [x25, #0x98]\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "ldr x14, [x25, #0xa0]\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "smlal2 v9.4s, v23.8h, v1.8h\n" + "ldr d1, [x3, #0x30]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "ldr d27, [x0, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "ldr x0, [x25, #0xb8]\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr x15, [x25, #0xc0]\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "ldr x9, [x25, #0xc8]\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "ldr x27, [x25, #0xd0]\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr x28, [x25, #0xd8]\n" + "smlal2 v9.4s, v31.8h, v2.8h\n" + "ldr d2, [x3, #0x38]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr x12, [x25, #0xe0]\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x7, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "ldr x7, [x25, #0xe8]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ldr q6, [x2, #0x0]\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "ldr q21, [x5, #0x0]\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "ldr q17, [x2, #0x10]\n" + "add x2, x2, #0x20\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr q14, [x5, #0x10]\n" + "add x5, x5, #0x20\n" + "smlal2 v9.4s, v30.8h, v3.8h\n" + "ldr d3, [x3, #0x40]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x26, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v11.4s, v30.4h, v4.4h\n" + "ldr x23, [x25, #0xf8]\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0x48]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v20.4s, v29.8h, v0.8h\n" + "smlal v18.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal2 v9.4s, v25.8h, v0.8h\n" + "ldr d0, [x3, #0x50]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x10]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v1.4h\n" + "ldr x22, [x25, #0x100]\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal2 v9.4s, v24.8h, v1.8h\n" + "ldr d1, [x3, #0x58]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v20.4s, v23.8h, v2.8h\n" + "ldr d23, [x20, x10]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v31.4h, v2.4h\n" + "ldr x20, [x25, #0x108]\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal2 v9.4s, v27.8h, v2.8h\n" + "ldr d2, [x3, #0x60]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v20.4s, v31.8h, v3.8h\n" + 
"ldr d31, [x13, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v3.4h\n" + "ldr x13, [x25, #0x110]\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v27.4h, v3.4h\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "smlal v10.4s, v23.4h, v3.4h\n" + "smlal2 v9.4s, v23.8h, v3.8h\n" + "ldr d3, [x3, #0x68]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "ldr x21, [x25, #0x118]\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d26, [x14, x10]\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr d4, [x3, #0x70]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v20.4s, v22.8h, v0.8h\n" + "ldr d22, [x0, x10]\n" + "ssubl v22.8h, v22.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "smlal v10.4s, v30.4h, v0.4h\n" + "smlal2 v9.4s, v30.8h, v0.8h\n" + "ldr d0, [x3, #0x78]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v20.4s, v25.8h, v1.8h\n" + "ldr d25, [x11, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "smlal v10.4s, v26.4h, v1.4h\n" + "smlal2 v9.4s, v26.8h, v1.8h\n" + "ldr d1, [x3, #0x80]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v20.4s, v24.8h, v2.8h\n" + "ldr d24, [x24, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "ldr d2, [x3, #0x88]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "ldr d27, [x15, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "ldr d3, [x3, #0x90]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v20.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x10]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d28, [x12, x10]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v22.4h, v4.4h\n" + "smlal2 v9.4s, v22.8h, v4.8h\n" + "ldr d4, [x3, #0x98]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x27, x10]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "smlal v10.4s, v23.4h, v0.4h\n" + "smlal2 v9.4s, v23.8h, v0.8h\n" + "ldr d0, [x3, #0xa0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v10.4s, v31.4h, v1.4h\n" + "smlal2 
v9.4s, v31.8h, v1.8h\n" + "ldr d1, [x3, #0xa8]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v20.4s, v26.8h, v2.8h\n" + "ldr d26, [x7, x10]\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v10.4s, v30.4h, v2.4h\n" + "smlal2 v9.4s, v30.8h, v2.8h\n" + "ldr d2, [x3, #0xb0]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x26, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v10.4s, v28.4h, v3.4h\n" + "smlal2 v9.4s, v28.8h, v3.8h\n" + "ldr d3, [x3, #0xb8]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0xc0]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "smlal2 v20.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x10]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ldr d25, [x20, x10]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v0.4h\n" + "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v20.4s, v23.8h, v1.8h\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "ldr d24, [x13, x10]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v1.4h\n" + "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v20.4s, v31.8h, v2.8h\n" + "smlal v18.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x10]\n" + "add x10, x10, #0x8\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v20.4s, v30.8h, v3.8h\n" + "smlal v18.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v20.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v27.4h, v4.4h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "sqrdmulh v20.4s, v20.4s, v17.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v5.4s, v5.4s, v17.4s\n" + "and v1.16b, v15.16b, v21.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v29.16b, v20.16b, v14.16b\n" + "and v3.16b, v18.16b, v21.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v2.16b, v5.16b, v14.16b\n" + "sqrdmulh v11.4s, v11.4s, v6.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v1.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "and v0.16b, v11.16b, v21.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v21.4s\n" + 
"sqadd v20.4s, v20.4s, v29.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "sqadd v5.4s, v5.4s, v2.4s\n" + "and v27.16b, v8.16b, v14.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "add v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v21.4s\n" + "srshl v5.4s, v5.4s, v14.4s\n" + "smin v15.4s, v15.4s, v12.4s\n" + "add v20.4s, v20.4s, v19.4s\n" + "add v18.4s, v18.4s, v19.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smin v18.4s, v18.4s, v12.4s\n" + "add v5.4s, v5.4s, v19.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smin v5.4s, v5.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "sqadd v11.4s, v11.4s, v0.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x17, x1]\n" + "smax v5.4s, v5.4s, v16.4s\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "srshl v11.4s, v11.4s, v21.4s\n" + "and v30.16b, v10.16b, v21.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "uzp1 v18.16b, v18.16b, v5.16b\n" + "add v11.4s, v11.4s, v19.4s\n" + "srshl v8.4s, v8.4s, v14.4s\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d18, [x16, x1]\n" + "smin v11.4s, v11.4s, v12.4s\n" + "sqrdmulh v9.4s, v9.4s, v17.4s\n" + "add v8.4s, v8.4s, v19.4s\n" + "sqadd v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "smin v8.4s, v8.4s, v12.4s\n" + "and v6.16b, v9.16b, v14.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "smax v8.4s, v8.4s, v16.4s\n" + "srshl v10.4s, v10.4s, v21.4s\n" + "uzp1 v11.16b, v11.16b, v8.16b\n" + "add v10.4s, v10.4s, v19.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x6, x1]\n" + "smin v10.4s, v10.4s, v12.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "srshl v9.4s, v9.4s, v14.4s\n" + "add v9.4s, v9.4s, v19.4s\n" + "smin v9.4s, v9.4s, v12.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v10.16b, v10.16b, v10.16b\n" + "str d10, [x8, x1]\n" + "add x1, x1, #0x8\n" + "beq 124f\n" + "add x3, x3, #0xc8\n" + "3:" // Oddments + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x4, #2, 5f\n" + "ld1 { v15.4s }, [x12], #0x10\n" + "tbz x4, #1, 4f\n" + "ld1 { v20.d }[0], [x12], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v20.s }[2], [x12]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v20.s }[0], [x12]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x4, #1, 6f\n" + "ld1 { v15.d }[0], [x12], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[2], [x12]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[0], [x12]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v18.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "mov v5.16b, v20.16b\n" + "ldr d1, [x3, #0x8]\n" + "mov v11.16b, v15.16b\n" + "ldr d2, [x3, #0x10]\n" + "mov v8.16b, v20.16b\n" + "ldr d3, [x3, #0x18]\n" + "mov v10.16b, v15.16b\n" + "ldr d4, [x3, #0x20]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "mov v9.16b, v20.16b\n" + "ldp x28, x27, [x25, #0x0]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "add x28, x28, x10\n" + "add x27, x27, x10\n" + "add x26, x26, x10\n" + "add x13, x13, x10\n" + "add x24, x24, x10\n" + "add x23, x23, x10\n" + "add x22, x22, x10\n" + "add x21, x21, x10\n" + "add x20, x20, x10\n" + "add x0, x0, x10\n" + "tbz x4, #2, 9f\n" + "ld1 { v31.s }[0], [x28], #0x4\n" + "ld1 { v30.s 
}[0], [x27], #0x4\n" + "ld1 { v29.s }[0], [x26], #0x4\n" + "ld1 { v28.s }[0], [x13], #0x4\n" + "ld1 { v27.s }[0], [x24], #0x4\n" + "ld1 { v23.s }[0], [x23], #0x4\n" + "ld1 { v25.s }[0], [x22], #0x4\n" + "ld1 { v24.s }[0], [x21], #0x4\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "ld1 { v22.s }[0], [x0], #0x4\n" + "tbz x4, #1, 8f\n" + "ld1 { v31.h }[2], [x28], #0x2\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x13], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v24.h }[2], [x21], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v22.h }[2], [x0], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[6], [x28]\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x13]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v24.b }[6], [x21]\n" + "ld1 { v26.b }[6], [x20]\n" + "ld1 { v22.b }[6], [x0]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[4], [x28]\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x13]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v24.b }[4], [x21]\n" + "ld1 { v26.b }[4], [x20]\n" + "ld1 { v22.b }[4], [x0]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x4, #1, 10f\n" + "ld1 { v31.h }[0], [x28], #0x2\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "ld1 { v29.h }[0], [x26], #0x2\n" + "ld1 { v28.h }[0], [x13], #0x2\n" + "ld1 { v27.h }[0], [x24], #0x2\n" + "ld1 { v23.h }[0], [x23], #0x2\n" + "ld1 { v25.h }[0], [x22], #0x2\n" + "ld1 { v24.h }[0], [x21], #0x2\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "ld1 { v22.h }[0], [x0], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[2], [x28]\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x13]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v24.b }[2], [x21]\n" + "ld1 { v26.b }[2], [x20]\n" + "ld1 { v22.b }[2], [x0]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[0], [x28]\n" + "ld1 { v30.b }[0], [x27]\n" + "ld1 { v29.b }[0], [x26]\n" + "ld1 { v28.b }[0], [x13]\n" + "ld1 { v27.b }[0], [x24]\n" + "ld1 { v23.b }[0], [x23]\n" + "ld1 { v25.b }[0], [x22]\n" + "ld1 { v24.b }[0], [x21]\n" + "ld1 { v26.b }[0], [x20]\n" + "ld1 { v22.b }[0], [x0]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ldr x20, [x25, #0x50]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ssubl v29.8h, v29.8b, v7.8b\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v28.4h, v0.4h\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ssubl v22.8h, v22.8b, v7.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "add x20, x20, x10\n" + "smlal v18.4s, v27.4h, v1.4h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "smlal2 v9.4s, v23.8h, v1.8h\n" + "smlal 
v15.4s, v27.4h, v2.4h\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "tbz x4, #2, 13f\n" + "ld1 { v31.s }[0], [x20], #0x4\n" + "tbz x4, #1, 12f\n" + "ld1 { v31.h }[2], [x20], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[6], [x20]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[4], [x20]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x4, #1, 14f\n" + "ld1 { v31.h }[0], [x20], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[2], [x20]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[0], [x20]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr x28, [x25, #0x58]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "add x28, x28, x10\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "smlal2 v9.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "tbz x4, #2, 17f\n" + "ld1 { v30.s }[0], [x28], #0x4\n" + "tbz x4, #1, 16f\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[6], [x28]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[4], [x28]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x4, #1, 18f\n" + "ld1 { v30.h }[0], [x28], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[2], [x28]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[0], [x28]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "smlal v15.4s, v24.4h, v4.4h\n" + "ldr x0, [x25, #0x60]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "add x0, x0, x10\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "smlal2 v9.4s, v30.8h, v3.8h\n" + "tbz x4, #2, 21f\n" + "ld1 { v27.s }[0], [x0], #0x4\n" + "tbz x4, #1, 20f\n" + "ld1 { v27.h }[2], [x0], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[6], [x0]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[4], [x0]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 5): Bit 2: Unset + "tbz x4, #1, 22f\n" + "ld1 { v27.h }[0], [x0], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[2], [x0]\n" + "b 23f\n" + "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[0], [x0]\n" + "23:" // Oddments: Load (0, 5): Bit 2: End + "smlal v11.4s, v30.4h, v4.4h\n" + "ldr d0, [x3, #0x28]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "ldr x7, [x25, #0x68]\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "add x7, x7, x10\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v20.4s, v29.8h, v0.8h\n" + "smlal v18.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "tbz x4, #2, 25f\n" + "ld1 { v25.s }[0], [x7], #0x4\n" + "tbz x4, #1, 24f\n" + "ld1 { v25.h }[2], [x7], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[6], [x7]\n" + "b 27f\n" + "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[4], [x7]\n" + "b 27f\n" + "25:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x4, #1, 
26f\n" + "ld1 { v25.h }[0], [x7], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[2], [x7]\n" + "b 27f\n" + "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[0], [x7]\n" + "27:" // Oddments: Load (2, 1): Bit 2: End + "ldr d1, [x3, #0x30]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "ldr x26, [x25, #0x70]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v25.8h, v0.8h\n" + "add x26, x26, x10\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "smlal v18.4s, v23.4h, v1.4h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "tbz x4, #2, 29f\n" + "ld1 { v24.s }[0], [x26], #0x4\n" + "tbz x4, #1, 28f\n" + "ld1 { v24.h }[2], [x26], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[6], [x26]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[4], [x26]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x4, #1, 30f\n" + "ld1 { v24.h }[0], [x26], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[2], [x26]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[0], [x26]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "ldr d2, [x3, #0x38]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "ldr x23, [x25, #0x78]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v1.8h\n" + "add x23, x23, x10\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v20.4s, v23.8h, v2.8h\n" + "smlal v18.4s, v31.4h, v2.4h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "tbz x4, #2, 33f\n" + "ld1 { v27.s }[0], [x23], #0x4\n" + "tbz x4, #1, 32f\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[6], [x23]\n" + "b 35f\n" + "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[4], [x23]\n" + "b 35f\n" + "33:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x4, #1, 34f\n" + "ld1 { v27.h }[0], [x23], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[2], [x23]\n" + "b 35f\n" + "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[0], [x23]\n" + "35:" // Oddments: Load (2, 3): Bit 2: End + "ldr d3, [x3, #0x40]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "ldr x20, [x25, #0x80]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v27.8h, v2.8h\n" + "add x20, x20, x10\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v20.4s, v31.8h, v3.8h\n" + "smlal v18.4s, v30.4h, v3.4h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v27.4h, v3.4h\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "tbz x4, #2, 37f\n" + "ld1 { v23.s }[0], [x20], #0x4\n" + "tbz x4, #1, 36f\n" + "ld1 { v23.h }[2], [x20], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[6], [x20]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[4], [x20]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x4, #1, 38f\n" + "ld1 { v23.h }[0], [x20], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[2], [x20]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[0], [x20]\n" + "39:" // Oddments: Load (2, 4): Bit 2: End + "ldr d4, [x3, #0x48]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "smlal v10.4s, v23.4h, v3.4h\n" + "ldr x22, [x25, #0x88]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + 
"smlal2 v9.4s, v23.8h, v3.8h\n" + "add x22, x22, x10\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "tbz x4, #2, 41f\n" + "ld1 { v28.s }[0], [x22], #0x4\n" + "tbz x4, #1, 40f\n" + "ld1 { v28.h }[2], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 5): Bit 2: Unset + "tbz x4, #1, 42f\n" + "ld1 { v28.h }[0], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 5): Bit 2: End + "ldr d0, [x3, #0x50]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr x13, [x25, #0x90]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "add x13, x13, x10\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v20.4s, v22.8h, v0.8h\n" + "smlal v18.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "tbz x4, #2, 45f\n" + "ld1 { v31.s }[0], [x13], #0x4\n" + "tbz x4, #1, 44f\n" + "ld1 { v31.h }[2], [x13], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[6], [x13]\n" + "b 47f\n" + "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[4], [x13]\n" + "b 47f\n" + "45:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x4, #1, 46f\n" + "ld1 { v31.h }[0], [x13], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[2], [x13]\n" + "b 47f\n" + "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[0], [x13]\n" + "47:" // Oddments: Load (3, 0): Bit 2: End + "ldr x21, [x25, #0x98]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "add x21, x21, x10\n" + "tbz x4, #2, 49f\n" + "ld1 { v30.s }[0], [x21], #0x4\n" + "tbz x4, #1, 48f\n" + "ld1 { v30.h }[2], [x21], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[6], [x21]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[4], [x21]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x4, #1, 50f\n" + "ld1 { v30.h }[0], [x21], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[2], [x21]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[0], [x21]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "ldr d1, [x3, #0x58]\n" + "ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v10.4s, v30.4h, v0.4h\n" + "ldr x14, [x25, #0xa0]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v30.8h, v0.8h\n" + "add x14, x14, x10\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v20.4s, v25.8h, v1.8h\n" + "smlal v18.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "tbz x4, #2, 53f\n" + "ld1 { v26.s }[0], [x14], #0x4\n" + "tbz x4, #1, 52f\n" + "ld1 { v26.h }[2], [x14], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[6], [x14]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[4], [x14]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x4, #1, 54f\n" + "ld1 { v26.h }[0], [x14], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[2], [x14]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 
2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[0], [x14]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "ldr d2, [x3, #0x60]\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "smlal v10.4s, v26.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v26.8h, v1.8h\n" + "add x11, x11, x10\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v20.4s, v24.8h, v2.8h\n" + "smlal v18.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "tbz x4, #2, 57f\n" + "ld1 { v25.s }[0], [x11], #0x4\n" + "tbz x4, #1, 56f\n" + "ld1 { v25.h }[2], [x11], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[6], [x11]\n" + "b 59f\n" + "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[4], [x11]\n" + "b 59f\n" + "57:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x4, #1, 58f\n" + "ld1 { v25.h }[0], [x11], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[2], [x11]\n" + "b 59f\n" + "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[0], [x11]\n" + "59:" // Oddments: Load (3, 3): Bit 2: End + "ldr d3, [x3, #0x68]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "add x24, x24, x10\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal v18.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 61f\n" + "ld1 { v24.s }[0], [x24], #0x4\n" + "tbz x4, #1, 60f\n" + "ld1 { v24.h }[2], [x24], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[6], [x24]\n" + "b 63f\n" + "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[4], [x24]\n" + "b 63f\n" + "61:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x4, #1, 62f\n" + "ld1 { v24.h }[0], [x24], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[2], [x24]\n" + "b 63f\n" + "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[0], [x24]\n" + "63:" // Oddments: Load (3, 4): Bit 2: End + "ldr d4, [x3, #0x70]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "ldr x0, [x25, #0xb8]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "add x0, x0, x10\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v20.4s, v23.8h, v4.8h\n" + "smlal v18.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 65f\n" + "ld1 { v22.s }[0], [x0], #0x4\n" + "tbz x4, #1, 64f\n" + "ld1 { v22.h }[2], [x0], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[6], [x0]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[4], [x0]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 5): Bit 2: Unset + "tbz x4, #1, 66f\n" + "ld1 { v22.h }[0], [x0], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[2], [x0]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[0], [x0]\n" + "67:" // Oddments: Load (3, 5): Bit 2: End + "ldr d0, [x3, #0x78]\n" + "ssubl v22.8h, v22.8b, v7.8b\n" + "smlal v10.4s, v22.4h, v4.4h\n" + "ldr x15, [x25, #0xc0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v9.4s, v22.8h, v4.8h\n" + "add x15, x15, x10\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "smlal v18.4s, v30.4h, 
v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "tbz x4, #2, 69f\n" + "ld1 { v27.s }[0], [x15], #0x4\n" + "tbz x4, #1, 68f\n" + "ld1 { v27.h }[2], [x15], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[6], [x15]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[4], [x15]\n" + "b 71f\n" + "69:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x4, #1, 70f\n" + "ld1 { v27.h }[0], [x15], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[2], [x15]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[0], [x15]\n" + "71:" // Oddments: Load (4, 0): Bit 2: End + "ldr x9, [x25, #0xc8]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "add x9, x9, x10\n" + "tbz x4, #2, 73f\n" + "ld1 { v23.s }[0], [x9], #0x4\n" + "tbz x4, #1, 72f\n" + "ld1 { v23.h }[2], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[6], [x9]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[4], [x9]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x4, #1, 74f\n" + "ld1 { v23.h }[0], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[2], [x9]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[0], [x9]\n" + "75:" // Oddments: Load (4, 1): Bit 2: End + "ldr d1, [x3, #0x80]\n" + "ssubl v23.8h, v23.8b, v7.8b\n" + "smlal v10.4s, v23.4h, v0.4h\n" + "ldr x27, [x25, #0xd0]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v23.8h, v0.8h\n" + "add x27, x27, x10\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "smlal v18.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "tbz x4, #2, 77f\n" + "ld1 { v31.s }[0], [x27], #0x4\n" + "tbz x4, #1, 76f\n" + "ld1 { v31.h }[2], [x27], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[6], [x27]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[4], [x27]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x4, #1, 78f\n" + "ld1 { v31.h }[0], [x27], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[2], [x27]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[0], [x27]\n" + "79:" // Oddments: Load (4, 2): Bit 2: End + "ldr d2, [x3, #0x88]\n" + "ssubl v31.8h, v31.8b, v7.8b\n" + "smlal v10.4s, v31.4h, v1.4h\n" + "ldr x28, [x25, #0xd8]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "add x28, x28, x10\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v20.4s, v26.8h, v2.8h\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "tbz x4, #2, 81f\n" + "ld1 { v30.s }[0], [x28], #0x4\n" + "tbz x4, #1, 80f\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[6], [x28]\n" + "b 83f\n" + "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[4], [x28]\n" + "b 83f\n" + "81:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x4, #1, 82f\n" + "ld1 { v30.h }[0], [x28], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[2], [x28]\n" + "b 83f\n" + "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[0], [x28]\n" + "83:" // Oddments: Load (4, 3): Bit 2: End + "ldr d3, [x3, #0x90]\n" + 
"ssubl v30.8h, v30.8b, v7.8b\n" + "smlal v10.4s, v30.4h, v2.4h\n" + "ldr x12, [x25, #0xe0]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v30.8h, v2.8h\n" + "add x12, x12, x10\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "tbz x4, #2, 85f\n" + "ld1 { v28.s }[0], [x12], #0x4\n" + "tbz x4, #1, 84f\n" + "ld1 { v28.h }[2], [x12], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[6], [x12]\n" + "b 87f\n" + "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[4], [x12]\n" + "b 87f\n" + "85:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x4, #1, 86f\n" + "ld1 { v28.h }[0], [x12], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[2], [x12]\n" + "b 87f\n" + "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[0], [x12]\n" + "87:" // Oddments: Load (4, 4): Bit 2: End + "ldr d4, [x3, #0x98]\n" + "ssubl v28.8h, v28.8b, v7.8b\n" + "smlal v10.4s, v28.4h, v3.4h\n" + "ldr x7, [x25, #0xe8]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v9.4s, v28.8h, v3.8h\n" + "add x7, x7, x10\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "smlal v18.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "tbz x4, #2, 89f\n" + "ld1 { v26.s }[0], [x7], #0x4\n" + "tbz x4, #1, 88f\n" + "ld1 { v26.h }[2], [x7], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[6], [x7]\n" + "b 91f\n" + "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[4], [x7]\n" + "b 91f\n" + "89:" // Oddments: Load (4, 5): Bit 2: Unset + "tbz x4, #1, 90f\n" + "ld1 { v26.h }[0], [x7], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[2], [x7]\n" + "b 91f\n" + "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[0], [x7]\n" + "91:" // Oddments: Load (4, 5): Bit 2: End + "ldr d0, [x3, #0xa0]\n" + "ssubl v26.8h, v26.8b, v7.8b\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "add x26, x26, x10\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "smlal2 v20.4s, v27.8h, v0.8h\n" + "smlal v18.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "tbz x4, #2, 93f\n" + "ld1 { v25.s }[0], [x26], #0x4\n" + "tbz x4, #1, 92f\n" + "ld1 { v25.h }[2], [x26], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[6], [x26]\n" + "b 95f\n" + "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[4], [x26]\n" + "b 95f\n" + "93:" // Oddments: Load (5, 0): Bit 2: Unset + "tbz x4, #1, 94f\n" + "ld1 { v25.h }[0], [x26], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[2], [x26]\n" + "b 95f\n" + "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[0], [x26]\n" + "95:" // Oddments: Load (5, 0): Bit 2: End + "ldr x23, [x25, #0xf8]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "add x23, x23, x10\n" + "tbz x4, #2, 97f\n" + "ld1 { v24.s }[0], [x23], #0x4\n" + "tbz x4, #1, 96f\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[6], [x23]\n" + "b 99f\n" + "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[4], [x23]\n" + "b 99f\n" + "97:" // Oddments: Load (5, 1): Bit 2: Unset + "tbz x4, #1, 98f\n" + "ld1 { 
v24.h }[0], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[2], [x23]\n" + "b 99f\n" + "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[0], [x23]\n" + "99:" // Oddments: Load (5, 1): Bit 2: End + "ldr d1, [x3, #0xa8]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v0.4h\n" + "ldr x22, [x25, #0x100]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v0.8h\n" + "add x22, x22, x10\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v20.4s, v23.8h, v1.8h\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "tbz x4, #2, 101f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x4, #1, 100f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[6], [x22]\n" + "b 103f\n" + "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[4], [x22]\n" + "b 103f\n" + "101:" // Oddments: Load (5, 2): Bit 2: Unset + "tbz x4, #1, 102f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[2], [x22]\n" + "b 103f\n" + "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[0], [x22]\n" + "103:" // Oddments: Load (5, 2): Bit 2: End + "ldr d2, [x3, #0xb0]\n" + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v1.4h\n" + "ldr x20, [x25, #0x108]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v27.8h, v1.8h\n" + "add x20, x20, x10\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v20.4s, v31.8h, v2.8h\n" + "smlal v18.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "tbz x4, #2, 105f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x4, #1, 104f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 107f\n" + "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 107f\n" + "105:" // Oddments: Load (5, 3): Bit 2: Unset + "tbz x4, #1, 106f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 107f\n" + "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[0], [x20]\n" + "107:" // Oddments: Load (5, 3): Bit 2: End + "ldr d3, [x3, #0xb8]\n" + "ssubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "ldr x13, [x25, #0x110]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "add x13, x13, x10\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v20.4s, v30.8h, v3.8h\n" + "smlal v18.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 109f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "tbz x4, #1, 108f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[6], [x13]\n" + "b 111f\n" + "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[4], [x13]\n" + "b 111f\n" + "109:" // Oddments: Load (5, 4): Bit 2: Unset + "tbz x4, #1, 110f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[2], [x13]\n" + "b 111f\n" + "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[0], [x13]\n" + "111:" // Oddments: Load (5, 4): Bit 2: End + "ldr d4, [x3, #0xc0]\n" + "ssubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "ldr x21, [x25, #0x118]\n" + "ssubl 
v4.8h, v4.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "add x21, x21, x10\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v20.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 113f\n" + "ld1 { v27.s }[0], [x21], #0x4\n" + "tbz x4, #1, 112f\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[6], [x21]\n" + "b 115f\n" + "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[4], [x21]\n" + "b 115f\n" + "113:" // Oddments: Load (5, 5): Bit 2: Unset + "tbz x4, #1, 114f\n" + "ld1 { v27.h }[0], [x21], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[2], [x21]\n" + "b 115f\n" + "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[0], [x21]\n" + "115:" // Oddments: Load (5, 5): Bit 2: End + "ssubl v27.8h, v27.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v4.4h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "tbz x4, #2, 117f\n" + "ld1 { v6.4s }, [x2], #0x10\n" + "ld1 { v21.4s }, [x5], #0x10\n" + "tbz x4, #1, 116f\n" + "ld1 { v17.d }[0], [x2], #0x8\n" + "ld1 { v14.d }[0], [x5], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v17.s }[2], [x2]\n" + "ld1 { v14.s }[2], [x5]\n" + "b 119f\n" + "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v17.s }[0], [x2]\n" + "ld1 { v14.s }[0], [x5]\n" + "b 119f\n" + "117:" // Oddments: Load requant params: Bit 2: Unset + "tbz x4, #1, 118f\n" + "ld1 { v6.d }[0], [x2], #0x8\n" + "ld1 { v21.d }[0], [x5], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v6.s }[2], [x2]\n" + "ld1 { v21.s }[2], [x5]\n" + "b 119f\n" + "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v6.s }[0], [x2]\n" + "ld1 { v21.s }[0], [x5]\n" + "119:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "add x17, x17, x1\n" + "sqrdmulh v20.4s, v20.4s, v17.4s\n" + "add x16, x16, x1\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "add x6, x6, x1\n" + "sqrdmulh v5.4s, v5.4s, v17.4s\n" + "add x8, x8, x1\n" + "sqrdmulh v11.4s, v11.4s, v6.4s\n" + "and v1.16b, v15.16b, v21.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v29.16b, v20.16b, v14.16b\n" + "and v3.16b, v18.16b, v21.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v2.16b, v5.16b, v14.16b\n" + "and v0.16b, v11.16b, v21.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v1.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "sqrdmulh v9.4s, v9.4s, v17.4s\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "srshl v15.4s, v15.4s, v21.4s\n" + "sqadd v5.4s, v5.4s, v2.4s\n" + "srshl v20.4s, v20.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v21.4s\n" + "add v15.4s, v15.4s, v19.4s\n" + "srshl v5.4s, v5.4s, v14.4s\n" + "add v20.4s, v20.4s, v19.4s\n" + "smin v15.4s, v15.4s, v12.4s\n" + "add v18.4s, v18.4s, v19.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smin v18.4s, v18.4s, v12.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "add v5.4s, v5.4s, v19.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "smin v5.4s, v5.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "sqadd v11.4s, v11.4s, v0.4s\n" + "smax v5.4s, v5.4s, v16.4s\n" + "and v27.16b, v8.16b, v14.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "uzp1 v18.16b, v18.16b, v5.16b\n" + "srshl v11.4s, v11.4s, v21.4s\n" + "and v30.16b, 
v10.16b, v21.16b\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "add v11.4s, v11.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v27.4s\n"
+ "and v6.16b, v9.16b, v14.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "smin v11.4s, v11.4s, v12.4s\n"
+ "srshl v8.4s, v8.4s, v14.4s\n"
+ "sqadd v10.4s, v10.4s, v30.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "add v8.4s, v8.4s, v19.4s\n"
+ "srshl v10.4s, v10.4s, v21.4s\n"
+ "sqadd v9.4s, v9.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v12.4s\n"
+ "add v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v14.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "uzp1 v11.16b, v11.16b, v8.16b\n"
+ "add v9.4s, v9.4s, v19.4s\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v12.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x4, #2, 121f\n"
+ "st1 { v15.s }[0], [x17], #0x4\n"
+ "st1 { v18.s }[0], [x16], #0x4\n"
+ "st1 { v11.s }[0], [x6], #0x4\n"
+ "st1 { v10.s }[0], [x8], #0x4\n"
+ "tbz x4, #1, 120f\n"
+ "st1 { v15.h }[2], [x17], #0x2\n"
+ "st1 { v18.h }[2], [x16], #0x2\n"
+ "st1 { v11.h }[2], [x6], #0x2\n"
+ "st1 { v10.h }[2], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[6], [x17], #0x1\n"
+ "st1 { v18.b }[6], [x16], #0x1\n"
+ "st1 { v11.b }[6], [x6], #0x1\n"
+ "st1 { v10.b }[6], [x8], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[4], [x17], #0x1\n"
+ "st1 { v18.b }[4], [x16], #0x1\n"
+ "st1 { v11.b }[4], [x6], #0x1\n"
+ "st1 { v10.b }[4], [x8], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x4, #1, 122f\n"
+ "st1 { v15.h }[0], [x17], #0x2\n"
+ "st1 { v18.h }[0], [x16], #0x2\n"
+ "st1 { v11.h }[0], [x6], #0x2\n"
+ "st1 { v10.h }[0], [x8], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[2], [x17], #0x1\n"
+ "st1 { v18.b }[2], [x16], #0x1\n"
+ "st1 { v11.b }[2], [x6], #0x1\n"
+ "st1 { v10.b }[2], [x8], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v15.b }[0], [x17], #0x1\n"
+ "st1 { v18.b }[0], [x16], #0x1\n"
+ "st1 { v11.b }[0], [x6], #0x1\n"
+ "st1 { v10.b }[0], [x8], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+
+ "124:" // End
+
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
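The epilogue that closes the kernel above recurs in every quantized kernel in this patch: sqrdmulh scales each accumulator by a fixed-point multiplier, srshl performs a rounding right shift (preceded by an and/sshr/sqadd correction for negative values), the c_offset is added, the result is clamped to [minval, maxval], and uzp1 narrows it back to bytes. A minimal scalar sketch of that scheme, assuming gemmlowp-style rounding; the helper names below are illustrative and not part of the library:

#include <algorithm>
#include <cstdint>

// ~sqrdmulh: rounding doubling multiply, returning the high half.
inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    return (int32_t) (((int64_t) a * b + (1LL << 30)) >> 31);
}

// ~srshl with a non-positive shift operand, i.e. a rounding shift right.
inline int32_t rounding_shift_right(int32_t x, int32_t shift)
{
    if (shift >= 0) return x << shift;
    return (x + (1 << (-shift - 1))) >> -shift;
}

inline int8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                         int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_doubling_high_mul(acc, mul);
    v = rounding_shift_right(v, shift) + c_offset;
    return (int8_t) std::max(minval, std::min(maxval, v));
}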
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp new file mode 100644 index 0000000000..4e845cceaf --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); + +struct a64_s8q_nhwc_generic_output9_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int n_output_points = 9; + + kern_type kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl; + + a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..ad5545a304 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "arm_gemm.hpp" +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl( + const int8_t *const *const inptrs, + int8_t *const *const outptrs, + const void *params, + const arm_gemm::Requantize32& qp, + const unsigned int n_points, + const unsigned int n_channels +) +{ + __asm__ __volatile__( + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v12.4s }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n" + "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n" + "ld1r { v11.4s }, [x20]\n" + "ld1r { v10.16b }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v9.16b }, [x20]\n" + "ld1r { v8.4s }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n" + "ld1r { v7.4s }, [x20]\n" + "ld1r { v6.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n" + "mov x11, #0x0\n" + "ld1r { v5.4s }, [x19]\n" + "lsr x10, %x[n_channels], #0x2\n" + "cbz x10, 6f\n" + "1:" // Channel loop + "movi v27.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x11, #0x2\n" + "ldr q27, [%x[bias], x19]\n" + "2:" // Channel loop: Load bias: Done + "mov v26.16b, v27.16b\n" + "ldr s16, [%x[params]], #0x4\n" + "mov x20, %x[inptrs]\n" + "mov v25.16b, v27.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, %x[n_points], #0x1\n" + "mov v24.16b, v27.16b\n" + "ldr s4, [x9, x11]\n" + "mov v23.16b, v27.16b\n" + "mov v22.16b, v27.16b\n" + "ldr s3, [x28, x11]\n" + "mov v21.16b, v27.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v20.16b, v27.16b\n" + "ldr s2, [x27, x11]\n" + "mov v19.16b, v27.16b\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "ldr s1, [x26, x11]\n" + "ssubl v4.8h, v4.8b, v10.8b\n" + "ldp x25, x24, [x20], #0x10\n" + "ssubl v3.8h, v3.8b, v10.8b\n" + "ldr s0, [x25, x11]\n" + "ssubl v2.8h, v2.8b, v10.8b\n" + "ssubl v1.8h, v1.8b, v10.8b\n" + "ldr s31, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "ssubl v0.8h, v0.8b, v10.8b\n" + "ldr s30, [x23, x11]\n" + "ldr s29, [x22, x11]\n" + "ssubl v31.8h, v31.8b, v10.8b\n" + "ldr x21, [x20], #0x8\n" + "ssubl v30.8h, v30.8b, v10.8b\n" + "ldr s28, [x21, x11]\n" + "ssubl v29.8h, v29.8b, v10.8b\n" + "ssubl v28.8h, v28.8b, v10.8b\n" + "ble 4f\n" + "3:" // Channel loop: Planar loop + "smlal 
v27.4s, v4.4h, v16.4h\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, x19, #0x1\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "ldr s4, [x9, x11]\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "ldr s3, [x28, x11]\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "ldp x27, x26, [x20], #0x10\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "ldr s2, [x27, x11]\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "ldr s16, [%x[params]], #0x4\n" + "ssubl v4.8h, v4.8b, v10.8b\n" + "ldr s1, [x26, x11]\n" + "ssubl v3.8h, v3.8b, v10.8b\n" + "ldp x25, x24, [x20], #0x10\n" + "ssubl v2.8h, v2.8b, v10.8b\n" + "ldr s0, [x25, x11]\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v10.8b\n" + "ldr s31, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "ssubl v0.8h, v0.8b, v10.8b\n" + "ldr s30, [x23, x11]\n" + "ldr s29, [x22, x11]\n" + "ssubl v31.8h, v31.8b, v10.8b\n" + "ldr x21, [x20], #0x8\n" + "ssubl v30.8h, v30.8b, v10.8b\n" + "ldr s28, [x21, x11]\n" + "ssubl v29.8h, v29.8b, v10.8b\n" + "ssubl v28.8h, v28.8b, v10.8b\n" + "bgt 3b\n" + "4:" // Channel loop: Planar tail + "smlal v27.4s, v4.4h, v16.4h\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "cbz %x[rq_mul_ptr], 5f\n" + "lsl x19, x11, #0x2\n" + "ldr q6, [%x[rq_mul_ptr], x19]\n" + "ldr q5, [%x[rq_right_shift_ptr], x19]\n" + "cbz %x[rq_left_shift_ptr], 5f\n" + "ldr q7, [%x[rq_left_shift_ptr], x19]\n" + "5:" // Channel loop: Load quantisation parameters: Done + "sshl v27.4s, v27.4s, v7.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "sshl v26.4s, v26.4s, v7.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "sshl v25.4s, v25.4s, v7.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "sqrdmulh v27.4s, v27.4s, v6.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "sqrdmulh v25.4s, v25.4s, v6.4s\n" + "sshl v24.4s, v24.4s, v7.4s\n" + "and v16.16b, v27.16b, v5.16b\n" + "and v18.16b, v26.16b, v5.16b\n" + "and v17.16b, v25.16b, v5.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sqadd v26.4s, v26.4s, v18.4s\n" + "sqadd v25.4s, v25.4s, v17.4s\n" + "sqrdmulh v24.4s, v24.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v5.4s\n" + "srshl v26.4s, v26.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v5.4s\n" + "and v16.16b, v24.16b, v5.16b\n" + "add v27.4s, v27.4s, v8.4s\n" + "add v26.4s, v26.4s, v8.4s\n" + "add v25.4s, v25.4s, v8.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v27.4s, v27.4s, v12.4s\n" + "smax v26.4s, v26.4s, v12.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "smin v27.4s, v27.4s, v11.4s\n" + "smin v26.4s, v26.4s, v11.4s\n" + "smax v25.4s, v25.4s, v12.4s\n" + "srshl v24.4s, v24.4s, v5.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "smin v25.4s, v25.4s, v11.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x27, x11]\n" + "add v24.4s, v24.4s, v8.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x26, x11]\n" + "smax v24.4s, v24.4s, v12.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x25, x11]\n" + "sshl v23.4s, v23.4s, v7.4s\n" + "sshl v22.4s, v22.4s, v7.4s\n" + "smin v24.4s, v24.4s, v11.4s\n" + "sqrdmulh v23.4s, v23.4s, v6.4s\n" + 
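// Requantisation of the remaining accumulators (v23..v19): sshl applies the
+ // optional left shift, sqrdmulh the fixed-point multiplier; the and/sshr/sqadd
+ // pairs correct the rounding of the srshl right shift for negative values;
+ // add v8 applies the c_offset, smax/smin clamp to [minval, maxval], and uzp1
+ // narrows the result back to bytes.
+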
"sqrdmulh v22.4s, v22.4s, v6.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sshl v21.4s, v21.4s, v7.4s\n" + "and v17.16b, v23.16b, v5.16b\n" + "and v16.16b, v22.16b, v5.16b\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x24, x11]\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v21.16b, v5.16b\n" + "sshl v20.4s, v20.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v7.4s\n" + "srshl v23.4s, v23.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v5.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v6.4s\n" + "add v23.4s, v23.4s, v8.4s\n" + "add v22.4s, v22.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "and v17.16b, v20.16b, v5.16b\n" + "sqrdmulh v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v5.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v19.16b, v5.16b\n" + "smin v23.4s, v23.4s, v11.4s\n" + "add v21.4s, v21.4s, v8.4s\n" + "sqadd v20.4s, v20.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v22.4s, v22.4s, v12.4s\n" + "smax v21.4s, v21.4s, v12.4s\n" + "srshl v20.4s, v20.4s, v5.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "smin v21.4s, v21.4s, v11.4s\n" + "add v20.4s, v20.4s, v8.4s\n" + "srshl v19.4s, v19.4s, v5.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x23, x11]\n" + "add v19.4s, v19.4s, v8.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x22, x11]\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x21, x11]\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x20, x11]\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x19, x11]\n" + "add x11, x11, #0x4\n" + "cmp x11, x10, LSL #2\n" + "blt 1b\n" + "6:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 24f\n" + "movi v27.4s, #0x0\n" + "cbz %x[bias], 9f\n" + "add x19, %x[bias], x11, LSL #2\n" + "tbz %x[n_channels], #1, 7f\n" + "ld1 { v27.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v27.s }[2], [x19], #0x4\n" + "b 8f\n" + "7:" // Oddments: Load bias: Bit 1: Unset + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "8:" // Oddments: Load bias: Bit 1: End + + "9:" // Oddments: Load bias: Done + "mov v26.16b, v27.16b\n" + "ldr s16, [%x[params]], #0x4\n" + "mov x20, %x[inptrs]\n" + "mov v25.16b, v27.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "mov v24.16b, v27.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v23.16b, v27.16b\n" + "ldp x25, x24, [x20], #0x10\n" + "mov v22.16b, v27.16b\n" + "add x28, x28, x11\n" + "mov v21.16b, v27.16b\n" + "ldp x23, x22, [x20], #0x10\n" + "mov v20.16b, v27.16b\n" + "add x27, x27, x11\n" + "mov v19.16b, v27.16b\n" + "ldr x21, [x20], #0x8\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "add x26, x26, x11\n" + "add x25, x25, x11\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #1, 10f\n" + "ldr h4, [x9], #0x2\n" + "ldr h3, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, [x25], #0x2\n" + "ldr h31, [x24], #0x2\n" + "ldr h30, [x23], #0x2\n" + "ldr h29, [x22], #0x2\n" + "ldr h28, [x21], #0x2\n" + "tbz 
%x[n_channels], #0, 11f\n" + "ld1 { v4.b }[2], [x9], #0x1\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v2.b }[2], [x27], #0x1\n" + "ld1 { v1.b }[2], [x26], #0x1\n" + "ld1 { v0.b }[2], [x25], #0x1\n" + "ld1 { v31.b }[2], [x24], #0x1\n" + "ld1 { v30.b }[2], [x23], #0x1\n" + "ld1 { v29.b }[2], [x22], #0x1\n" + "ld1 { v28.b }[2], [x21], #0x1\n" + "b 11f\n" + "10:" // Oddments: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 11f\n" + "ldr b4, [x9], #0x1\n" + "ldr b3, [x28], #0x1\n" + "ldr b2, [x27], #0x1\n" + "ldr b1, [x26], #0x1\n" + "ldr b0, [x25], #0x1\n" + "ldr b31, [x24], #0x1\n" + "ldr b30, [x23], #0x1\n" + "ldr b29, [x22], #0x1\n" + "ldr b28, [x21], #0x1\n" + "11:" // Oddments: Load: Bit 1: End + "ssubl v4.8h, v4.8b, v10.8b\n" + "subs x19, %x[n_points], #0x1\n" + "ssubl v3.8h, v3.8b, v10.8b\n" + "ssubl v2.8h, v2.8b, v10.8b\n" + "ssubl v1.8h, v1.8b, v10.8b\n" + "ssubl v0.8h, v0.8b, v10.8b\n" + "ssubl v31.8h, v31.8b, v10.8b\n" + "ssubl v30.8h, v30.8b, v10.8b\n" + "ssubl v29.8h, v29.8b, v10.8b\n" + "ssubl v28.8h, v28.8b, v10.8b\n" + "ble 15f\n" + "12:" // Oddments: Planar loop + "smlal v27.4s, v4.4h, v16.4h\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "ldp x27, x26, [x20], #0x10\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "ldp x25, x24, [x20], #0x10\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "add x28, x28, x11\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "ldp x23, x22, [x20], #0x10\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "add x27, x27, x11\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "ldr x21, [x20], #0x8\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "add x26, x26, x11\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "ldr s16, [%x[params]], #0x4\n" + "add x25, x25, x11\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr h4, [x9], #0x2\n" + "ldr h3, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, [x25], #0x2\n" + "ldr h31, [x24], #0x2\n" + "ldr h30, [x23], #0x2\n" + "ldr h29, [x22], #0x2\n" + "ldr h28, [x21], #0x2\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v4.b }[2], [x9], #0x1\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v2.b }[2], [x27], #0x1\n" + "ld1 { v1.b }[2], [x26], #0x1\n" + "ld1 { v0.b }[2], [x25], #0x1\n" + "ld1 { v31.b }[2], [x24], #0x1\n" + "ld1 { v30.b }[2], [x23], #0x1\n" + "ld1 { v29.b }[2], [x22], #0x1\n" + "ld1 { v28.b }[2], [x21], #0x1\n" + "b 14f\n" + "13:" // Oddments: Planar loop: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 14f\n" + "ldr b4, [x9], #0x1\n" + "ldr b3, [x28], #0x1\n" + "ldr b2, [x27], #0x1\n" + "ldr b1, [x26], #0x1\n" + "ldr b0, [x25], #0x1\n" + "ldr b31, [x24], #0x1\n" + "ldr b30, [x23], #0x1\n" + "ldr b29, [x22], #0x1\n" + "ldr b28, [x21], #0x1\n" + "14:" // Oddments: Planar loop: Load: Bit 1: End + "ssubl v4.8h, v4.8b, v10.8b\n" + "subs x19, x19, #0x1\n" + "ssubl v3.8h, v3.8b, v10.8b\n" + "ssubl v2.8h, v2.8b, v10.8b\n" + "ssubl v1.8h, v1.8b, v10.8b\n" + "ssubl v0.8h, v0.8b, v10.8b\n" + "ssubl v31.8h, v31.8b, v10.8b\n" + "ssubl v30.8h, v30.8b, v10.8b\n" + "ssubl v29.8h, v29.8b, v10.8b\n" + "ssubl v28.8h, v28.8b, v10.8b\n" + "bgt 12b\n" + "15:" // Oddments: Planar tail + "smlal v27.4s, v4.4h, v16.4h\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "cbz 
%x[rq_mul_ptr], 21f\n" + "add x21, %x[rq_mul_ptr], x11, LSL #2\n" + "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n" + "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v6.d }[0], [x21], #0x8\n" + "ld1 { v5.d }[0], [x20], #0x8\n" + "cbz %x[rq_left_shift_ptr], 16f\n" + "ld1 { v7.d }[0], [x19], #0x8\n" + "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v6.s }[2], [x21], #0x4\n" + "ld1 { v5.s }[2], [x20], #0x4\n" + "cbz %x[rq_left_shift_ptr], 17f\n" + "ld1 { v7.s }[2], [x19], #0x4\n" + "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done + "b 20f\n" + "18:" // Oddments: Load quantisation parameters: Bit 1: Unset + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v6.s }[0], [x21], #0x4\n" + "ld1 { v5.s }[0], [x20], #0x4\n" + "cbz %x[rq_left_shift_ptr], 19f\n" + "ld1 { v7.s }[0], [x19], #0x4\n" + "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done + + "20:" // Oddments: Load quantisation parameters: Bit 1: End + + "21:" // Oddments: Load quantisation parameters: Done + "sshl v27.4s, v27.4s, v7.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "add x27, x27, x11\n" + "sqrdmulh v27.4s, v27.4s, v6.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "sshl v26.4s, v26.4s, v7.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "add x26, x26, x11\n" + "sshl v25.4s, v25.4s, v7.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "sshl v24.4s, v24.4s, v7.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x25, x25, x11\n" + "and v16.16b, v27.16b, v5.16b\n" + "add x24, x24, x11\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "add x23, x23, x11\n" + "sqrdmulh v25.4s, v25.4s, v6.4s\n" + "add x22, x22, x11\n" + "sqrdmulh v24.4s, v24.4s, v6.4s\n" + "add x21, x21, x11\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add x20, x20, x11\n" + "and v18.16b, v26.16b, v5.16b\n" + "add x19, x19, x11\n" + "and v17.16b, v25.16b, v5.16b\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v24.16b, v5.16b\n" + "srshl v27.4s, v27.4s, v5.4s\n" + "sqadd v26.4s, v26.4s, v18.4s\n" + "sqadd v25.4s, v25.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v27.4s, v27.4s, v8.4s\n" + "srshl v26.4s, v26.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v5.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "smax v27.4s, v27.4s, v12.4s\n" + "add v26.4s, v26.4s, v8.4s\n" + "add v25.4s, v25.4s, v8.4s\n" + "srshl v24.4s, v24.4s, v5.4s\n" + "smin v27.4s, v27.4s, v11.4s\n" + "smax v26.4s, v26.4s, v12.4s\n" + "smax v25.4s, v25.4s, v12.4s\n" + "add v24.4s, v24.4s, v8.4s\n" + "smin v26.4s, v26.4s, v11.4s\n" + "smin v25.4s, v25.4s, v11.4s\n" + "smax v24.4s, v24.4s, v12.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v24.4s, v24.4s, v11.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sshl v23.4s, v23.4s, v7.4s\n" + "sshl v22.4s, v22.4s, v7.4s\n" + "sqrdmulh v23.4s, v23.4s, v6.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sshl v21.4s, v21.4s, v7.4s\n" + "sshl v20.4s, v20.4s, v7.4s\n" + "and v17.16b, v23.16b, v5.16b\n" + "and v16.16b, v22.16b, v5.16b\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v6.4s\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sqadd 
v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v21.16b, v5.16b\n" + "and v17.16b, v20.16b, v5.16b\n" + "srshl v23.4s, v23.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v5.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "add v23.4s, v23.4s, v8.4s\n" + "add v22.4s, v22.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "sqadd v20.4s, v20.4s, v17.4s\n" + "smax v23.4s, v23.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v5.4s\n" + "srshl v20.4s, v20.4s, v5.4s\n" + "smin v23.4s, v23.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "add v21.4s, v21.4s, v8.4s\n" + "add v20.4s, v20.4s, v8.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smax v21.4s, v21.4s, v12.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v21.4s, v21.4s, v11.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "sshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "sqrdmulh v19.4s, v19.4s, v6.4s\n" + "and v16.16b, v19.16b, v5.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "srshl v19.4s, v19.4s, v5.4s\n" + "add v19.4s, v19.4s, v8.4s\n" + "smax v19.4s, v19.4s, v12.4s\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "tbz %x[n_channels], #1, 22f\n" + "st1 { v27.h }[0], [x27], #0x2\n" + "st1 { v26.h }[0], [x26], #0x2\n" + "st1 { v25.h }[0], [x25], #0x2\n" + "st1 { v24.h }[0], [x24], #0x2\n" + "st1 { v23.h }[0], [x23], #0x2\n" + "st1 { v22.h }[0], [x22], #0x2\n" + "st1 { v21.h }[0], [x21], #0x2\n" + "st1 { v20.h }[0], [x20], #0x2\n" + "st1 { v19.h }[0], [x19], #0x2\n" + "tbz %x[n_channels], #0, 23f\n" + "st1 { v27.b }[2], [x27], #0x1\n" + "st1 { v26.b }[2], [x26], #0x1\n" + "st1 { v25.b }[2], [x25], #0x1\n" + "st1 { v24.b }[2], [x24], #0x1\n" + "st1 { v23.b }[2], [x23], #0x1\n" + "st1 { v22.b }[2], [x22], #0x1\n" + "st1 { v21.b }[2], [x21], #0x1\n" + "st1 { v20.b }[2], [x20], #0x1\n" + "st1 { v19.b }[2], [x19], #0x1\n" + "b 23f\n" + "22:" // Oddments: Store: Bit 1: Unset + "tbz %x[n_channels], #0, 23f\n" + "st1 { v27.b }[0], [x27], #0x1\n" + "st1 { v26.b }[0], [x26], #0x1\n" + "st1 { v25.b }[0], [x25], #0x1\n" + "st1 { v24.b }[0], [x24], #0x1\n" + "st1 { v23.b }[0], [x23], #0x1\n" + "st1 { v22.b }[0], [x22], #0x1\n" + "st1 { v21.b }[0], [x21], #0x1\n" + "st1 { v20.b }[0], [x20], #0x1\n" + "st1 { v19.b }[0], [x19], #0x1\n" + "23:" // Oddments: Store: Bit 1: End + + "24:" // End + + : [params] "+&r" (params) + : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), 
[rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp new file mode 100644 index 0000000000..b9fef4f9ab --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..b9fef4f9ab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 9;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+  a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..2fb6d3538f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + + +#include "arm_gemm.hpp" +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl( + const int8_t *const *const inptrs, + int8_t *const *const outptrs, + const void *params, + unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "movi v5.16b, #0x1\n" + "ldr x22, [%x[inptrs], #0x0]\n" + "add SP, SP, #-0x80\n" + "ushr v5.4s, v5.4s, #0x8\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "movi v26.4s, #0x0\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "mov x11, #0x0\n" + "movi v1.4s, #0x0\n" + "ld1 { v15.16b }, [x22]\n" + "mov x10, #0x0\n" + "movi v22.4s, #0x0\n" + "ld1 { v29.16b }, [x20]\n" + "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "movi v25.4s, #0x0\n" + "ld1 { v0.16b }, [x19]\n" + "add x28, %x[qp], %[offsetof_Requantize32_minval]\n" + "movi v13.4s, #0x0\n" + "ldr x20, [%x[inptrs], #0x18]\n" + "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n" + "mov v20.16b, v15.16b\n" + "ldr x19, [%x[inptrs], #0x20]\n" + "cmp %x[n_channels], #0x4\n" + "ext v20.16b, v20.16b, v20.16b, #0x2\n" + "ld1r { v4.4s }, [x21]\n" + "mov v17.16b, v15.16b\n" + "ld1 { v2.16b }, [x20]\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "ld1 { v7.16b }, [x19]\n" + "mov v23.16b, v15.16b\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "ext v23.16b, v23.16b, v23.16b, #0x6\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "mov v18.16b, v29.16b\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "zip1 v15.4s, v15.4s, v17.4s\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "ext v18.16b, v18.16b, v18.16b, #0x2\n" + "ld1r { v14.4s }, [x9]\n" + "zip1 v20.4s, v20.4s, v23.4s\n" + "ld1r { v27.4s }, [x28]\n" + "zip1 v15.4s, v15.4s, v20.4s\n" + "ld1r { v23.4s }, [x27]\n" + "mov v17.16b, v29.16b\n" + "ldr q6, [%x[params], #0x0]\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "ldr q8, [%x[params], #0x10]\n" + "mov v11.16b, v29.16b\n" + "ldr q9, [%x[params], #0x20]\n" + "ext v11.16b, v11.16b, v11.16b, #0x6\n" + "ldr q10, [%x[params], #0x30]\n" + "add %x[params], %x[params], #0x40\n" + "zip1 v29.4s, v29.4s, v17.4s\n" + "mov v12.16b, v0.16b\n" + "ext v12.16b, v12.16b, v12.16b, #0x2\n" + "zip1 v18.4s, v18.4s, v11.4s\n" + "zip1 v29.4s, v29.4s, v18.4s\n" + "mov v17.16b, v0.16b\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "mov v11.16b, v0.16b\n" + "ext v11.16b, v11.16b, v11.16b, #0x6\n" + "mov v18.16b, v2.16b\n" + "zip1 v0.4s, v0.4s, v17.4s\n" + "ext v18.16b, v18.16b, v18.16b, #0x2\n" + "zip1 v12.4s, v12.4s, v11.4s\n" + "zip1 v0.4s, v0.4s, v12.4s\n" + "mov v17.16b, v2.16b\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "mov v19.16b, v2.16b\n" + "ext v19.16b, v19.16b, v19.16b, #0x6\n" + "mov v28.16b, v7.16b\n" + "zip1 v2.4s, v2.4s, v17.4s\n" + "ext v28.16b, v28.16b, v28.16b, #0x2\n" + "zip1 v18.4s, v18.4s, v19.4s\n" + "zip1 v2.4s, v2.4s, v18.4s\n" + "mov v18.16b, v7.16b\n" + "ext v18.16b, v18.16b, v18.16b, #0x4\n" + "mov v21.16b, v7.16b\n" + "ext v21.16b, v21.16b, v21.16b, #0x6\n" + "movi v30.4s, #0x0\n" + "zip1 v7.4s, v7.4s, v18.4s\n" + "movi v3.4s, #0x0\n" + "zip1 v28.4s, v28.4s, v21.4s\n" + "zip1 v7.4s, v7.4s, v28.4s\n" + "movi v12.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v31.4s, #0x0\n" + ".inst 0x4f8fe0ba // sdot v26.4s, v5.16b, v15.4b[0]\n" + ".inst 0x4fafe0a1 // 
sdot v1.4s, v5.16b, v15.4b[1]\n" + ".inst 0x4f8fe8b6 // sdot v22.4s, v5.16b, v15.4b[2]\n" + ".inst 0x4fafe8b9 // sdot v25.4s, v5.16b, v15.4b[3]\n" + ".inst 0x4f9de0ad // sdot v13.4s, v5.16b, v29.4b[0]\n" + ".inst 0x4fbde0be // sdot v30.4s, v5.16b, v29.4b[1]\n" + ".inst 0x4f9de8a3 // sdot v3.4s, v5.16b, v29.4b[2]\n" + ".inst 0x4fbde8ac // sdot v12.4s, v5.16b, v29.4b[3]\n" + ".inst 0x4f80e0ab // sdot v11.4s, v5.16b, v0.4b[0]\n" + ".inst 0x4fa0e0b3 // sdot v19.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4f80e8b5 // sdot v21.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4f82e0b0 // sdot v16.4s, v5.16b, v2.4b[0]\n" + ".inst 0x4fa2e0bc // sdot v28.4s, v5.16b, v2.4b[1]\n" + ".inst 0x4f82e8b2 // sdot v18.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4fa2e8b4 // sdot v20.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4f87e0b8 // sdot v24.4s, v5.16b, v7.4b[0]\n" + ".inst 0x4fa7e0bf // sdot v31.4s, v5.16b, v7.4b[1]\n" + "mov v26.16b, v26.16b\n" + "mov v1.16b, v1.16b\n" + "mov v22.16b, v22.16b\n" + "mov v25.16b, v25.16b\n" + "add v26.4s, v26.4s, v13.4s\n" + "movi v13.4s, #0x0\n" + ".inst 0x4f87e8ad // sdot v13.4s, v5.16b, v7.4b[2]\n" + "add v1.4s, v1.4s, v30.4s\n" + "movi v30.4s, #0x0\n" + ".inst 0x4fa7e8be // sdot v30.4s, v5.16b, v7.4b[3]\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v25.4s, v25.4s, v12.4s\n" + "add v26.4s, v26.4s, v11.4s\n" + "add v1.4s, v1.4s, v19.4s\n" + "add v22.4s, v22.4s, v21.4s\n" + "add v25.4s, v25.4s, v17.4s\n" + "mov v11.16b, v11.16b\n" + "mov v3.16b, v19.16b\n" + "mov v19.16b, v21.16b\n" + "mov v21.16b, v17.16b\n" + "add v11.4s, v11.4s, v16.4s\n" + "add v3.4s, v3.4s, v28.4s\n" + "add v19.4s, v19.4s, v18.4s\n" + "add v21.4s, v21.4s, v20.4s\n" + "add v11.4s, v11.4s, v24.4s\n" + "add v3.4s, v3.4s, v31.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "neg v4.4s, v4.4s\n" + "mul v26.4s, v26.4s, v4.4s\n" + "str q26, [SP, #0x0]\n" + "mul v1.4s, v1.4s, v4.4s\n" + "mul v22.4s, v22.4s, v4.4s\n" + "str q1, [SP, #0x10]\n" + "mul v25.4s, v25.4s, v4.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "str q22, [SP, #0x20]\n" + "mul v3.4s, v3.4s, v4.4s\n" + "str q25, [SP, #0x30]\n" + "mul v19.4s, v19.4s, v4.4s\n" + "mul v21.4s, v21.4s, v4.4s\n" + "str q11, [SP, #0x40]\n" + "add v26.4s, v26.4s, v6.4s\n" + "str q3, [SP, #0x50]\n" + "add v1.4s, v1.4s, v6.4s\n" + "str q19, [SP, #0x60]\n" + "add v22.4s, v22.4s, v6.4s\n" + "add v25.4s, v25.4s, v6.4s\n" + "str q21, [SP, #0x70]\n" + "add v11.4s, v11.4s, v6.4s\n" + "add v3.4s, v3.4s, v6.4s\n" + "add v19.4s, v19.4s, v6.4s\n" + "add v21.4s, v21.4s, v6.4s\n" + "ble 2f\n" + "1:" // Loop + ".inst 0x4f8fe11a // sdot v26.4s, v8.16b, v15.4b[0]\n" + "ldr q20, [%x[params], #0x0]\n" + "add x11, x11, #0x10\n" + ".inst 0x4fafe101 // sdot v1.4s, v8.16b, v15.4b[1]\n" + "ldr q4, [%x[params], #0x10]\n" + "sub %x[n_channels], %x[n_channels], #0x4\n" + ".inst 0x4f8fe916 // sdot v22.4s, v8.16b, v15.4b[2]\n" + "ldr q6, [%x[params], #0x20]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x4fafe919 // sdot v25.4s, v8.16b, v15.4b[3]\n" + ".inst 0x4f80e10b // sdot v11.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4fa0e103 // sdot v3.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4fa0e915 // sdot v21.4s, v8.16b, v0.4b[3]\n" + "ldr q8, [%x[params], #0x30]\n" + ".inst 0x4f9de13a // sdot v26.4s, v9.16b, v29.4b[0]\n" + ".inst 0x4fbde121 // sdot v1.4s, v9.16b, v29.4b[1]\n" + ".inst 0x4f9de936 // sdot v22.4s, v9.16b, v29.4b[2]\n" + ".inst 0x4fbde939 // sdot v25.4s, v9.16b, v29.4b[3]\n" + ".inst 0x4f82e12b // sdot v11.4s, v9.16b, 
v2.4b[0]\n" + ".inst 0x4fa2e123 // sdot v3.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4f82e933 // sdot v19.4s, v9.16b, v2.4b[2]\n" + ".inst 0x4fa2e935 // sdot v21.4s, v9.16b, v2.4b[3]\n" + "ldr q9, [%x[params], #0x40]\n" + ".inst 0x4f80e15a // sdot v26.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4fa0e141 // sdot v1.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4f80e956 // sdot v22.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4fa0e959 // sdot v25.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f87e14b // sdot v11.4s, v10.16b, v7.4b[0]\n" + ".inst 0x4fa7e143 // sdot v3.4s, v10.16b, v7.4b[1]\n" + ".inst 0x4f87e953 // sdot v19.4s, v10.16b, v7.4b[2]\n" + ".inst 0x4fa7e955 // sdot v21.4s, v10.16b, v7.4b[3]\n" + "ldr q10, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v1.4s, v1.4s, v20.4s\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "sqrdmulh v11.4s, v11.4s, v20.4s\n" + "and v30.16b, v26.16b, v4.16b\n" + "and v17.16b, v1.16b, v4.16b\n" + "and v16.16b, v22.16b, v4.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v30.4s\n" + "sqadd v1.4s, v1.4s, v17.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v25.16b, v4.16b\n" + "srshl v26.4s, v26.4s, v4.4s\n" + "srshl v1.4s, v1.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v4.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v26.4s, v26.4s, v14.4s\n" + "add v1.4s, v1.4s, v14.4s\n" + "add v22.4s, v22.4s, v14.4s\n" + "smin v26.4s, v26.4s, v23.4s\n" + "smin v1.4s, v1.4s, v23.4s\n" + "smin v22.4s, v22.4s, v23.4s\n" + "smax v26.4s, v26.4s, v27.4s\n" + "smax v1.4s, v1.4s, v27.4s\n" + "smax v22.4s, v22.4s, v27.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x26, x10]\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "ldr q26, [SP, #0x0]\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "str s1, [x25, x10]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "ldr q1, [SP, #0x10]\n" + "and v16.16b, v11.16b, v4.16b\n" + "str s22, [x24, x10]\n" + "sqrdmulh v3.4s, v3.4s, v20.4s\n" + "ldr q22, [SP, #0x20]\n" + "srshl v25.4s, v25.4s, v4.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v20.4s\n" + "and v17.16b, v3.16b, v4.16b\n" + "add v25.4s, v25.4s, v14.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v23.4s\n" + "and v16.16b, v19.16b, v4.16b\n" + "srshl v11.4s, v11.4s, v4.4s\n" + "smax v25.4s, v25.4s, v27.4s\n" + "sqadd v3.4s, v3.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v11.4s, v11.4s, v14.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x23, x10]\n" + "smin v11.4s, v11.4s, v23.4s\n" + "srshl v3.4s, v3.4s, v4.4s\n" + "ldr q25, [SP, #0x30]\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v11.4s, v11.4s, v27.4s\n" + "add v3.4s, v3.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v4.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "smin v3.4s, v3.4s, v23.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str s11, [x22, x10]\n" + "smax v3.4s, v3.4s, v27.4s\n" + "add v19.4s, v19.4s, v14.4s\n" + "ldr q11, [SP, #0x40]\n" + "and v16.16b, v21.16b, v4.16b\n" + "add v26.4s, v26.4s, v6.4s\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "smin v19.4s, v19.4s, v23.4s\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "str s3, [x21, x10]\n" + "smax v19.4s, v19.4s, v27.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr q3, [SP, #0x50]\n" + "add v1.4s, v1.4s, 
v6.4s\n" + "add v22.4s, v22.4s, v6.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x20, x10]\n" + "add v25.4s, v25.4s, v6.4s\n" + "add v11.4s, v11.4s, v6.4s\n" + "ldr q19, [SP, #0x60]\n" + "srshl v21.4s, v21.4s, v4.4s\n" + "add v3.4s, v3.4s, v6.4s\n" + "add v21.4s, v21.4s, v14.4s\n" + "add v19.4s, v19.4s, v6.4s\n" + "smin v21.4s, v21.4s, v23.4s\n" + "smax v21.4s, v21.4s, v27.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x19, x10]\n" + "add x10, x10, #0x4\n" + "ldr q21, [SP, #0x70]\n" + "add v21.4s, v21.4s, v6.4s\n" + "bgt 1b\n" + "2:" // Tail + ".inst 0x4f8fe11a // sdot v26.4s, v8.16b, v15.4b[0]\n" + "ldr q20, [%x[params], #0x0]\n" + "add x26, x26, x10\n" + ".inst 0x4fafe101 // sdot v1.4s, v8.16b, v15.4b[1]\n" + "ldr q4, [%x[params], #0x10]\n" + "add x25, x25, x10\n" + ".inst 0x4f8fe916 // sdot v22.4s, v8.16b, v15.4b[2]\n" + "add x24, x24, x10\n" + ".inst 0x4fafe919 // sdot v25.4s, v8.16b, v15.4b[3]\n" + "add x23, x23, x10\n" + ".inst 0x4f80e10b // sdot v11.4s, v8.16b, v0.4b[0]\n" + "add x22, x22, x10\n" + ".inst 0x4fa0e103 // sdot v3.4s, v8.16b, v0.4b[1]\n" + "add x21, x21, x10\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + "add x20, x20, x10\n" + ".inst 0x4fa0e915 // sdot v21.4s, v8.16b, v0.4b[3]\n" + "add x19, x19, x10\n" + ".inst 0x4f9de13a // sdot v26.4s, v9.16b, v29.4b[0]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x4fbde121 // sdot v1.4s, v9.16b, v29.4b[1]\n" + "add %x[params], %x[params], #0x20\n" + ".inst 0x4f9de936 // sdot v22.4s, v9.16b, v29.4b[2]\n" + ".inst 0x4fbde939 // sdot v25.4s, v9.16b, v29.4b[3]\n" + ".inst 0x4f82e12b // sdot v11.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4fa2e123 // sdot v3.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4f82e933 // sdot v19.4s, v9.16b, v2.4b[2]\n" + ".inst 0x4fa2e935 // sdot v21.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4f80e15a // sdot v26.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4fa0e141 // sdot v1.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4f80e956 // sdot v22.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4fa0e959 // sdot v25.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4f87e14b // sdot v11.4s, v10.16b, v7.4b[0]\n" + ".inst 0x4fa7e143 // sdot v3.4s, v10.16b, v7.4b[1]\n" + ".inst 0x4f87e953 // sdot v19.4s, v10.16b, v7.4b[2]\n" + ".inst 0x4fa7e955 // sdot v21.4s, v10.16b, v7.4b[3]\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v1.4s, v1.4s, v20.4s\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "and v30.16b, v26.16b, v4.16b\n" + "and v17.16b, v1.16b, v4.16b\n" + "and v16.16b, v22.16b, v4.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v30.4s\n" + "sqadd v1.4s, v1.4s, v17.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v25.16b, v4.16b\n" + "srshl v26.4s, v26.4s, v4.4s\n" + "srshl v1.4s, v1.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v4.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v26.4s, v26.4s, v14.4s\n" + "add v1.4s, v1.4s, v14.4s\n" + "add v22.4s, v22.4s, v14.4s\n" + "smin v26.4s, v26.4s, v23.4s\n" + "smin v1.4s, v1.4s, v23.4s\n" + "smin v22.4s, v22.4s, v23.4s\n" + "smax v26.4s, v26.4s, v27.4s\n" + "smax v1.4s, v1.4s, v27.4s\n" + "smax v22.4s, v22.4s, v27.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v11.4s, 
v11.4s, v20.4s\n" + "sqrdmulh v3.4s, v3.4s, v20.4s\n" + "srshl v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v20.4s\n" + "and v16.16b, v11.16b, v4.16b\n" + "and v17.16b, v3.16b, v4.16b\n" + "add v25.4s, v25.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v23.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" + "sqadd v3.4s, v3.4s, v17.4s\n" + "smax v25.4s, v25.4s, v27.4s\n" + "and v16.16b, v19.16b, v4.16b\n" + "srshl v11.4s, v11.4s, v4.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "srshl v3.4s, v3.4s, v4.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v11.4s, v11.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v3.4s, v3.4s, v14.4s\n" + "smin v11.4s, v11.4s, v23.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smin v3.4s, v3.4s, v23.4s\n" + "smax v11.4s, v11.4s, v27.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v3.4s, v3.4s, v27.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "srshl v19.4s, v19.4s, v4.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "and v16.16b, v21.16b, v4.16b\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "add v19.4s, v19.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v19.4s, v19.4s, v23.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "smax v19.4s, v19.4s, v27.4s\n" + "srshl v21.4s, v21.4s, v4.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "add v21.4s, v21.4s, v14.4s\n" + "smin v21.4s, v21.4s, v23.4s\n" + "smax v21.4s, v21.4s, v27.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "blt 3f\n" + "str s26, [x26, #0x0]\n" + "str s1, [x25, #0x0]\n" + "str s22, [x24, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s11, [x22, #0x0]\n" + "str s3, [x21, #0x0]\n" + "str s19, [x20, #0x0]\n" + "str s21, [x19, #0x0]\n" + "b 4f\n" + "3:" // Tail: Oddments + "st1 { v26.b }[0], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v25.b }[0], [x23], #0x1\n" + "st1 { v11.b }[0], [x22], #0x1\n" + "st1 { v3.b }[0], [x21], #0x1\n" + "st1 { v19.b }[0], [x20], #0x1\n" + "st1 { v21.b }[0], [x19], #0x1\n" + "beq 4f\n" + "st1 { v26.b }[1], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[1], [x25], #0x1\n" + "st1 { v22.b }[1], [x24], #0x1\n" + "st1 { v25.b }[1], [x23], #0x1\n" + "st1 { v11.b }[1], [x22], #0x1\n" + "st1 { v3.b }[1], [x21], #0x1\n" + "st1 { v19.b }[1], [x20], #0x1\n" + "st1 { v21.b }[1], [x19], #0x1\n" + "beq 4f\n" + "st1 { v26.b }[2], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v25.b }[2], [x23], #0x1\n" + "st1 { v11.b }[2], [x22], #0x1\n" + "st1 { v3.b }[2], [x21], #0x1\n" + "st1 { v19.b }[2], [x20], #0x1\n" + "st1 { v21.b }[2], [x19], #0x1\n" + "beq 4f\n" + "st1 { v26.b }[3], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[3], [x25], #0x1\n" + "st1 { v22.b }[3], [x24], #0x1\n" + "st1 { v25.b }[3], [x23], #0x1\n" + "st1 { v11.b }[3], [x22], #0x1\n" + "st1 { v3.b }[3], [x21], #0x1\n" + "st1 { v19.b }[3], [x20], #0x1\n" + "st1 { v21.b }[3], [x19], #0x1\n" + "4:" // Tail: End + "add SP, SP, #0x80\n" + : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params) + : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" 
(offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
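Before touching the weights, the dot-product kernel above sdot-s the input bytes against a vector of ones (the movi/ushr setup) and scales the resulting sums by the negated b_offset (the neg/mul block). That is the usual offset-correction identity for quantized dot products; a scalar sketch of the algebra, assuming the weight-dependent terms are folded into the packed bias at weight-packing time:

#include <cstdint>

// Offset-correction identity (scalar sketch, illustrative name):
//   sum((x[i]-a_off)*(w[i]-b_off))
//     = sum(x[i]*w[i]) - b_off*sum(x[i]) - a_off*sum(w[i]) + k*a_off*b_off
// sum(x[i]) varies with the output position, so it is computed on the fly by
// dotting the inputs against ones; the weight-only terms are constant per
// channel and can be precomputed.
inline int32_t offset_corrected_dot(const int8_t *x, const int8_t *w, int k,
                                    int32_t a_off, int32_t b_off, int32_t bias)
{
    int32_t dot = 0, sum_x = 0, sum_w = 0;
    for (int i = 0; i < k; i++)
    {
        dot   += (int32_t) x[i] * (int32_t) w[i]; // what sdot accumulates
        sum_x += x[i];                            // what the ones-vector sdot yields
        sum_w += w[i];
    }
    return bias + dot - b_off * sum_x - a_off * sum_w + k * a_off * b_off;
}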
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..9a3eed47fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 8;
+  constexpr static unsigned int input_cols = 6;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+  a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..95ad78cf6c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + + +#include "arm_gemm.hpp" +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl( + const int8_t *const *const inptrs, + int8_t *const *const outptrs, + const void *params, + unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "movi v15.16b, #0x1\n" + "ldr x21, [%x[inptrs], #0x0]\n" + "add SP, SP, #-0x80\n" + "movi v14.4s, #0x1\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "movi v28.4s, #0x0\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "mov x11, #0x0\n" + "movi v27.4s, #0x0\n" + "ld1 { v13.16b }, [x21]\n" + "mov x10, #0x0\n" + "movi v26.4s, #0x0\n" + "ld1 { v12.16b }, [x20]\n" + "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "movi v25.4s, #0x0\n" + "ld1 { v7.16b }, [x19]\n" + "add x28, %x[qp], %[offsetof_Requantize32_minval]\n" + "movi v24.4s, #0x0\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n" + "mov v18.16b, v13.16b\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "cmp %x[n_channels], #0x4\n" + "ext v18.16b, v18.16b, v18.16b, #0x1\n" + "ldr x19, [%x[inptrs], #0x28]\n" + "mov v17.16b, v12.16b\n" + "ld1 { v6.16b }, [x21]\n" + "ext v17.16b, v17.16b, v17.16b, #0x1\n" + "ld1 { v5.16b }, [x20]\n" + "mov v16.16b, v7.16b\n" + "ld1 { v4.16b }, [x19]\n" + "ext v16.16b, v16.16b, v16.16b, #0x1\n" + "ldr x20, [%x[inptrs], #0x30]\n" + "zip1 v13.2d, v13.2d, v18.2d\n" + "ldr x19, [%x[inptrs], #0x38]\n" + "zip1 v12.2d, v12.2d, v17.2d\n" + "ld1r { v3.4s }, [x22]\n" + "mov v18.16b, v6.16b\n" + "ld1 { v2.16b }, [x20]\n" + "zip1 v7.2d, v7.2d, v16.2d\n" + "ld1 { v1.16b }, [x19]\n" + "ext v18.16b, v18.16b, v18.16b, #0x1\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "mov v17.16b, v5.16b\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "ext v17.16b, v17.16b, v17.16b, #0x1\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "mov v16.16b, v4.16b\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "zip1 v6.2d, v6.2d, v18.2d\n" + "ld1r { v0.4s }, [x9]\n" + "ext v16.16b, v16.16b, v16.16b, #0x1\n" + "ld1r { v31.4s }, [x28]\n" + "zip1 v5.2d, v5.2d, v17.2d\n" + "ld1r { v30.4s }, [x27]\n" + "mov v17.16b, v2.16b\n" + "ldr q29, [%x[params], #0x0]\n" + "ext v17.16b, v17.16b, v17.16b, #0x1\n" + "ldr q8, [%x[params], #0x10]\n" + "zip1 v4.2d, v4.2d, v16.2d\n" + "ldr q9, [%x[params], #0x20]\n" + "mov v16.16b, v1.16b\n" + "ldr q10, [%x[params], #0x30]\n" + "ext v16.16b, v16.16b, v16.16b, #0x1\n" + "ldr q11, [%x[params], #0x40]\n" + "add %x[params], %x[params], #0x50\n" + "zip1 v2.2d, v2.2d, v17.2d\n" + "movi v23.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "zip1 v1.2d, v1.2d, v16.2d\n" + "movi v21.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v19.4s, #0x0\n" + ".inst 0x4f8de1fc // sdot v28.4s, v15.16b, v13.4b[0]\n" + ".inst 0x4f8de9fb // sdot v27.4s, v15.16b, v13.4b[2]\n" + ".inst 0x4f8ce1fa // sdot v26.4s, v15.16b, v12.4b[0]\n" + ".inst 0x4f8ce9f9 // sdot v25.4s, v15.16b, v12.4b[2]\n" + ".inst 0x4fade1dc // sdot v28.4s, v14.16b, v13.4b[1]\n" + ".inst 0x4fade9db // sdot v27.4s, v14.16b, v13.4b[3]\n" + ".inst 0x4face1da // sdot v26.4s, v14.16b, v12.4b[1]\n" + ".inst 0x4face9d9 // sdot v25.4s, v14.16b, v12.4b[3]\n" + ".inst 0x4f87e1f8 // sdot v24.4s, v15.16b, v7.4b[0]\n" + ".inst 0x4f87e9f7 // sdot v23.4s, v15.16b, v7.4b[2]\n" + ".inst 0x4f86e1f6 // sdot v22.4s, v15.16b, v6.4b[0]\n" + ".inst 0x4f86e9f5 // sdot v21.4s, v15.16b, v6.4b[2]\n" + ".inst 
0x4fa7e1d8 // sdot v24.4s, v14.16b, v7.4b[1]\n" + ".inst 0x4fa7e9d7 // sdot v23.4s, v14.16b, v7.4b[3]\n" + ".inst 0x4fa6e1d6 // sdot v22.4s, v14.16b, v6.4b[1]\n" + ".inst 0x4fa6e9d5 // sdot v21.4s, v14.16b, v6.4b[3]\n" + ".inst 0x4f85e1f2 // sdot v18.4s, v15.16b, v5.4b[0]\n" + ".inst 0x4f85e9f1 // sdot v17.4s, v15.16b, v5.4b[2]\n" + ".inst 0x4f84e1f0 // sdot v16.4s, v15.16b, v4.4b[0]\n" + ".inst 0x4f84e9f4 // sdot v20.4s, v15.16b, v4.4b[2]\n" + ".inst 0x4fa5e1d2 // sdot v18.4s, v14.16b, v5.4b[1]\n" + ".inst 0x4fa5e9d1 // sdot v17.4s, v14.16b, v5.4b[3]\n" + ".inst 0x4fa4e1d0 // sdot v16.4s, v14.16b, v4.4b[1]\n" + ".inst 0x4fa4e9d4 // sdot v20.4s, v14.16b, v4.4b[3]\n" + ".inst 0x4f82e1f3 // sdot v19.4s, v15.16b, v2.4b[0]\n" + "mov v28.16b, v28.16b\n" + "mov v27.16b, v27.16b\n" + "add v28.4s, v28.4s, v26.4s\n" + ".inst 0x4fa2e1d3 // sdot v19.4s, v14.16b, v2.4b[1]\n" + "add v27.4s, v27.4s, v25.4s\n" + "add v28.4s, v28.4s, v24.4s\n" + "mov v26.16b, v26.16b\n" + "add v27.4s, v27.4s, v23.4s\n" + "add v28.4s, v28.4s, v22.4s\n" + "mov v25.16b, v25.16b\n" + "add v27.4s, v27.4s, v21.4s\n" + "add v28.4s, v28.4s, v18.4s\n" + "add v26.4s, v26.4s, v24.4s\n" + "add v27.4s, v27.4s, v17.4s\n" + "add v25.4s, v25.4s, v23.4s\n" + "add v26.4s, v26.4s, v22.4s\n" + "mov v24.16b, v24.16b\n" + "add v25.4s, v25.4s, v21.4s\n" + "add v26.4s, v26.4s, v18.4s\n" + "mov v23.16b, v23.16b\n" + "add v25.4s, v25.4s, v17.4s\n" + "add v26.4s, v26.4s, v16.4s\n" + "add v24.4s, v24.4s, v22.4s\n" + "add v25.4s, v25.4s, v20.4s\n" + "add v23.4s, v23.4s, v21.4s\n" + "add v24.4s, v24.4s, v18.4s\n" + "mov v22.16b, v22.16b\n" + "add v23.4s, v23.4s, v17.4s\n" + "add v24.4s, v24.4s, v16.4s\n" + "mov v21.16b, v21.16b\n" + "add v23.4s, v23.4s, v20.4s\n" + "add v24.4s, v24.4s, v19.4s\n" + "add v22.4s, v22.4s, v18.4s\n" + "movi v18.4s, #0x0\n" + ".inst 0x4f82e9f2 // sdot v18.4s, v15.16b, v2.4b[2]\n" + "add v21.4s, v21.4s, v17.4s\n" + "movi v17.4s, #0x0\n" + ".inst 0x4f81e1f1 // sdot v17.4s, v15.16b, v1.4b[0]\n" + ".inst 0x4fa2e9d2 // sdot v18.4s, v14.16b, v2.4b[3]\n" + "add v22.4s, v22.4s, v16.4s\n" + "movi v16.4s, #0x0\n" + ".inst 0x4fa1e1d1 // sdot v17.4s, v14.16b, v1.4b[1]\n" + ".inst 0x4f81e9f0 // sdot v16.4s, v15.16b, v1.4b[2]\n" + "add v23.4s, v23.4s, v18.4s\n" + "add v21.4s, v21.4s, v20.4s\n" + "add v22.4s, v22.4s, v19.4s\n" + ".inst 0x4fa1e9d0 // sdot v16.4s, v14.16b, v1.4b[3]\n" + "add v21.4s, v21.4s, v18.4s\n" + "add v22.4s, v22.4s, v17.4s\n" + "neg v3.4s, v3.4s\n" + "add v21.4s, v21.4s, v16.4s\n" + "mul v28.4s, v28.4s, v3.4s\n" + "str q28, [SP, #0x0]\n" + "mul v27.4s, v27.4s, v3.4s\n" + "mul v26.4s, v26.4s, v3.4s\n" + "str q27, [SP, #0x10]\n" + "mul v25.4s, v25.4s, v3.4s\n" + "mul v24.4s, v24.4s, v3.4s\n" + "str q26, [SP, #0x20]\n" + "mul v23.4s, v23.4s, v3.4s\n" + "str q25, [SP, #0x30]\n" + "mul v22.4s, v22.4s, v3.4s\n" + "mul v21.4s, v21.4s, v3.4s\n" + "str q24, [SP, #0x40]\n" + "add v28.4s, v28.4s, v29.4s\n" + "str q23, [SP, #0x50]\n" + "add v27.4s, v27.4s, v29.4s\n" + "str q22, [SP, #0x60]\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "str q21, [SP, #0x70]\n" + "add v24.4s, v24.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "ble 2f\n" + "1:" // Loop + ".inst 0x4f8de11c // sdot v28.4s, v8.16b, v13.4b[0]\n" + "ldr q20, [%x[params], #0x60]\n" + "add x11, x11, #0x10\n" + ".inst 0x4f8de91b // sdot v27.4s, v8.16b, v13.4b[2]\n" + "ldr q19, [%x[params], #0x70]\n" + "sub %x[n_channels], %x[n_channels], #0x4\n" + ".inst 0x4f8ce11a // sdot v26.4s, 
v8.16b, v12.4b[0]\n" + "ldr q29, [%x[params], #0x80]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x4f8ce919 // sdot v25.4s, v8.16b, v12.4b[2]\n" + ".inst 0x4f87e118 // sdot v24.4s, v8.16b, v7.4b[0]\n" + ".inst 0x4f87e917 // sdot v23.4s, v8.16b, v7.4b[2]\n" + ".inst 0x4f86e116 // sdot v22.4s, v8.16b, v6.4b[0]\n" + ".inst 0x4f86e915 // sdot v21.4s, v8.16b, v6.4b[2]\n" + "ldr q8, [%x[params], #0x0]\n" + ".inst 0x4fade13c // sdot v28.4s, v9.16b, v13.4b[1]\n" + ".inst 0x4fade93b // sdot v27.4s, v9.16b, v13.4b[3]\n" + ".inst 0x4face13a // sdot v26.4s, v9.16b, v12.4b[1]\n" + ".inst 0x4face939 // sdot v25.4s, v9.16b, v12.4b[3]\n" + ".inst 0x4fa7e138 // sdot v24.4s, v9.16b, v7.4b[1]\n" + ".inst 0x4fa7e937 // sdot v23.4s, v9.16b, v7.4b[3]\n" + ".inst 0x4fa6e136 // sdot v22.4s, v9.16b, v6.4b[1]\n" + ".inst 0x4fa6e935 // sdot v21.4s, v9.16b, v6.4b[3]\n" + "ldr q9, [%x[params], #0x10]\n" + ".inst 0x4f8ce15c // sdot v28.4s, v10.16b, v12.4b[0]\n" + ".inst 0x4f8ce95b // sdot v27.4s, v10.16b, v12.4b[2]\n" + ".inst 0x4f87e15a // sdot v26.4s, v10.16b, v7.4b[0]\n" + ".inst 0x4f87e959 // sdot v25.4s, v10.16b, v7.4b[2]\n" + ".inst 0x4f86e158 // sdot v24.4s, v10.16b, v6.4b[0]\n" + ".inst 0x4f86e957 // sdot v23.4s, v10.16b, v6.4b[2]\n" + ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + ".inst 0x4f85e955 // sdot v21.4s, v10.16b, v5.4b[2]\n" + "ldr q10, [%x[params], #0x20]\n" + ".inst 0x4face17c // sdot v28.4s, v11.16b, v12.4b[1]\n" + ".inst 0x4face97b // sdot v27.4s, v11.16b, v12.4b[3]\n" + ".inst 0x4fa7e17a // sdot v26.4s, v11.16b, v7.4b[1]\n" + ".inst 0x4fa7e979 // sdot v25.4s, v11.16b, v7.4b[3]\n" + ".inst 0x4fa6e178 // sdot v24.4s, v11.16b, v6.4b[1]\n" + ".inst 0x4fa6e977 // sdot v23.4s, v11.16b, v6.4b[3]\n" + ".inst 0x4fa5e176 // sdot v22.4s, v11.16b, v5.4b[1]\n" + ".inst 0x4fa5e975 // sdot v21.4s, v11.16b, v5.4b[3]\n" + "ldr q11, [%x[params], #0x30]\n" + ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" + ".inst 0x4f87e91b // sdot v27.4s, v8.16b, v7.4b[2]\n" + ".inst 0x4f86e11a // sdot v26.4s, v8.16b, v6.4b[0]\n" + ".inst 0x4f86e919 // sdot v25.4s, v8.16b, v6.4b[2]\n" + ".inst 0x4f85e118 // sdot v24.4s, v8.16b, v5.4b[0]\n" + ".inst 0x4f85e917 // sdot v23.4s, v8.16b, v5.4b[2]\n" + ".inst 0x4f84e116 // sdot v22.4s, v8.16b, v4.4b[0]\n" + ".inst 0x4f84e915 // sdot v21.4s, v8.16b, v4.4b[2]\n" + "ldr q8, [%x[params], #0x40]\n" + ".inst 0x4fa7e13c // sdot v28.4s, v9.16b, v7.4b[1]\n" + ".inst 0x4fa7e93b // sdot v27.4s, v9.16b, v7.4b[3]\n" + ".inst 0x4fa6e13a // sdot v26.4s, v9.16b, v6.4b[1]\n" + ".inst 0x4fa6e939 // sdot v25.4s, v9.16b, v6.4b[3]\n" + ".inst 0x4fa5e138 // sdot v24.4s, v9.16b, v5.4b[1]\n" + ".inst 0x4fa5e937 // sdot v23.4s, v9.16b, v5.4b[3]\n" + ".inst 0x4fa4e136 // sdot v22.4s, v9.16b, v4.4b[1]\n" + ".inst 0x4fa4e935 // sdot v21.4s, v9.16b, v4.4b[3]\n" + "ldr q9, [%x[params], #0x50]\n" + ".inst 0x4f86e15c // sdot v28.4s, v10.16b, v6.4b[0]\n" + ".inst 0x4f86e95b // sdot v27.4s, v10.16b, v6.4b[2]\n" + ".inst 0x4f85e15a // sdot v26.4s, v10.16b, v5.4b[0]\n" + ".inst 0x4f85e959 // sdot v25.4s, v10.16b, v5.4b[2]\n" + ".inst 0x4f84e158 // sdot v24.4s, v10.16b, v4.4b[0]\n" + ".inst 0x4f84e957 // sdot v23.4s, v10.16b, v4.4b[2]\n" + ".inst 0x4f82e156 // sdot v22.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f82e955 // sdot v21.4s, v10.16b, v2.4b[2]\n" + "ldr q10, [%x[params], #0xb0]\n" + ".inst 0x4fa6e17c // sdot v28.4s, v11.16b, v6.4b[1]\n" + ".inst 0x4fa6e97b // sdot v27.4s, v11.16b, v6.4b[3]\n" + ".inst 0x4fa5e17a // sdot v26.4s, v11.16b, v5.4b[1]\n" + ".inst 0x4fa5e979 // sdot v25.4s, v11.16b, 
v5.4b[3]\n" + ".inst 0x4fa4e178 // sdot v24.4s, v11.16b, v4.4b[1]\n" + ".inst 0x4fa4e977 // sdot v23.4s, v11.16b, v4.4b[3]\n" + ".inst 0x4fa2e176 // sdot v22.4s, v11.16b, v2.4b[1]\n" + ".inst 0x4fa2e975 // sdot v21.4s, v11.16b, v2.4b[3]\n" + "ldr q11, [%x[params], #0xc0]\n" + ".inst 0x4f85e11c // sdot v28.4s, v8.16b, v5.4b[0]\n" + ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n" + ".inst 0x4f84e11a // sdot v26.4s, v8.16b, v4.4b[0]\n" + ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n" + ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f82e917 // sdot v23.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + "ldr q8, [%x[params], #0x90]\n" + ".inst 0x4fa5e13c // sdot v28.4s, v9.16b, v5.4b[1]\n" + ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n" + ".inst 0x4fa4e13a // sdot v26.4s, v9.16b, v4.4b[1]\n" + ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n" + ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa2e937 // sdot v23.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa1e136 // sdot v22.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa1e935 // sdot v21.4s, v9.16b, v1.4b[3]\n" + "ldr q9, [%x[params], #0xa0]\n" + "add %x[params], %x[params], #0xd0\n" + "sqrdmulh v28.4s, v28.4s, v20.4s\n" + "sqrdmulh v27.4s, v27.4s, v20.4s\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "sqrdmulh v24.4s, v24.4s, v20.4s\n" + "and v18.16b, v28.16b, v19.16b\n" + "and v17.16b, v27.16b, v19.16b\n" + "and v16.16b, v26.16b, v19.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v18.4s\n" + "sqadd v27.4s, v27.4s, v17.4s\n" + "sqadd v26.4s, v26.4s, v16.4s\n" + "and v16.16b, v25.16b, v19.16b\n" + "srshl v28.4s, v28.4s, v19.4s\n" + "srshl v27.4s, v27.4s, v19.4s\n" + "srshl v26.4s, v26.4s, v19.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v26.4s, v26.4s, v0.4s\n" + "smin v28.4s, v28.4s, v30.4s\n" + "smin v27.4s, v27.4s, v30.4s\n" + "smin v26.4s, v26.4s, v30.4s\n" + "smax v28.4s, v28.4s, v31.4s\n" + "smax v27.4s, v27.4s, v31.4s\n" + "smax v26.4s, v26.4s, v31.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x26, x10]\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "ldr q28, [SP, #0x0]\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "str s27, [x25, x10]\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "ldr q27, [SP, #0x10]\n" + "and v16.16b, v24.16b, v19.16b\n" + "str s26, [x24, x10]\n" + "sqrdmulh v23.4s, v23.4s, v20.4s\n" + "ldr q26, [SP, #0x20]\n" + "srshl v25.4s, v25.4s, v19.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "and v17.16b, v23.16b, v19.16b\n" + "add v25.4s, v25.4s, v0.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v30.4s\n" + "and v16.16b, v22.16b, v19.16b\n" + "srshl v24.4s, v24.4s, v19.4s\n" + "smax v25.4s, v25.4s, v31.4s\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v0.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x23, x10]\n" + "smin v24.4s, v24.4s, v30.4s\n" + "srshl v23.4s, v23.4s, v19.4s\n" + "ldr q25, [SP, #0x30]\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v24.4s, v24.4s, v31.4s\n" + "add v23.4s, v23.4s, 
v0.4s\n" + "srshl v22.4s, v22.4s, v19.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "smin v23.4s, v23.4s, v30.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x22, x10]\n" + "smax v23.4s, v23.4s, v31.4s\n" + "add v22.4s, v22.4s, v0.4s\n" + "ldr q24, [SP, #0x40]\n" + "and v16.16b, v21.16b, v19.16b\n" + "add v28.4s, v28.4s, v29.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v22.4s, v22.4s, v30.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x21, x10]\n" + "smax v22.4s, v22.4s, v31.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr q23, [SP, #0x50]\n" + "add v27.4s, v27.4s, v29.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x20, x10]\n" + "add v25.4s, v25.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "ldr q22, [SP, #0x60]\n" + "srshl v21.4s, v21.4s, v19.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v21.4s, v21.4s, v0.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "smin v21.4s, v21.4s, v30.4s\n" + "smax v21.4s, v21.4s, v31.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x19, x10]\n" + "add x10, x10, #0x4\n" + "ldr q21, [SP, #0x70]\n" + "add v21.4s, v21.4s, v29.4s\n" + "bgt 1b\n" + "2:" // Tail + ".inst 0x4f8de11c // sdot v28.4s, v8.16b, v13.4b[0]\n" + "ldr q20, [%x[params], #0x60]\n" + "add x26, x26, x10\n" + ".inst 0x4f8de91b // sdot v27.4s, v8.16b, v13.4b[2]\n" + "ldr q19, [%x[params], #0x70]\n" + "add x25, x25, x10\n" + ".inst 0x4f8ce11a // sdot v26.4s, v8.16b, v12.4b[0]\n" + "add x24, x24, x10\n" + ".inst 0x4f8ce919 // sdot v25.4s, v8.16b, v12.4b[2]\n" + "add x23, x23, x10\n" + ".inst 0x4f87e118 // sdot v24.4s, v8.16b, v7.4b[0]\n" + "add x22, x22, x10\n" + ".inst 0x4f87e917 // sdot v23.4s, v8.16b, v7.4b[2]\n" + "add x21, x21, x10\n" + ".inst 0x4f86e116 // sdot v22.4s, v8.16b, v6.4b[0]\n" + "add x20, x20, x10\n" + ".inst 0x4f86e915 // sdot v21.4s, v8.16b, v6.4b[2]\n" + "ldr q8, [%x[params], #0x0]\n" + "add x19, x19, x10\n" + ".inst 0x4fade13c // sdot v28.4s, v9.16b, v13.4b[1]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x4fade93b // sdot v27.4s, v9.16b, v13.4b[3]\n" + ".inst 0x4face13a // sdot v26.4s, v9.16b, v12.4b[1]\n" + ".inst 0x4face939 // sdot v25.4s, v9.16b, v12.4b[3]\n" + ".inst 0x4fa7e138 // sdot v24.4s, v9.16b, v7.4b[1]\n" + ".inst 0x4fa7e937 // sdot v23.4s, v9.16b, v7.4b[3]\n" + ".inst 0x4fa6e136 // sdot v22.4s, v9.16b, v6.4b[1]\n" + ".inst 0x4fa6e935 // sdot v21.4s, v9.16b, v6.4b[3]\n" + "ldr q9, [%x[params], #0x10]\n" + ".inst 0x4f8ce15c // sdot v28.4s, v10.16b, v12.4b[0]\n" + ".inst 0x4f8ce95b // sdot v27.4s, v10.16b, v12.4b[2]\n" + ".inst 0x4f87e15a // sdot v26.4s, v10.16b, v7.4b[0]\n" + ".inst 0x4f87e959 // sdot v25.4s, v10.16b, v7.4b[2]\n" + ".inst 0x4f86e158 // sdot v24.4s, v10.16b, v6.4b[0]\n" + ".inst 0x4f86e957 // sdot v23.4s, v10.16b, v6.4b[2]\n" + ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" + ".inst 0x4f85e955 // sdot v21.4s, v10.16b, v5.4b[2]\n" + "ldr q10, [%x[params], #0x20]\n" + ".inst 0x4face17c // sdot v28.4s, v11.16b, v12.4b[1]\n" + ".inst 0x4face97b // sdot v27.4s, v11.16b, v12.4b[3]\n" + ".inst 0x4fa7e17a // sdot v26.4s, v11.16b, v7.4b[1]\n" + ".inst 0x4fa7e979 // sdot v25.4s, v11.16b, v7.4b[3]\n" + ".inst 0x4fa6e178 // sdot v24.4s, v11.16b, v6.4b[1]\n" + ".inst 0x4fa6e977 // sdot v23.4s, v11.16b, v6.4b[3]\n" + ".inst 0x4fa5e176 // sdot v22.4s, v11.16b, v5.4b[1]\n" + ".inst 0x4fa5e975 // sdot v21.4s, v11.16b, v5.4b[3]\n" + "ldr q11, [%x[params], #0x30]\n" + ".inst 0x4f87e11c // sdot 
v28.4s, v8.16b, v7.4b[0]\n" + ".inst 0x4f87e91b // sdot v27.4s, v8.16b, v7.4b[2]\n" + ".inst 0x4f86e11a // sdot v26.4s, v8.16b, v6.4b[0]\n" + ".inst 0x4f86e919 // sdot v25.4s, v8.16b, v6.4b[2]\n" + ".inst 0x4f85e118 // sdot v24.4s, v8.16b, v5.4b[0]\n" + ".inst 0x4f85e917 // sdot v23.4s, v8.16b, v5.4b[2]\n" + ".inst 0x4f84e116 // sdot v22.4s, v8.16b, v4.4b[0]\n" + ".inst 0x4f84e915 // sdot v21.4s, v8.16b, v4.4b[2]\n" + "ldr q8, [%x[params], #0x40]\n" + ".inst 0x4fa7e13c // sdot v28.4s, v9.16b, v7.4b[1]\n" + ".inst 0x4fa7e93b // sdot v27.4s, v9.16b, v7.4b[3]\n" + ".inst 0x4fa6e13a // sdot v26.4s, v9.16b, v6.4b[1]\n" + ".inst 0x4fa6e939 // sdot v25.4s, v9.16b, v6.4b[3]\n" + ".inst 0x4fa5e138 // sdot v24.4s, v9.16b, v5.4b[1]\n" + ".inst 0x4fa5e937 // sdot v23.4s, v9.16b, v5.4b[3]\n" + ".inst 0x4fa4e136 // sdot v22.4s, v9.16b, v4.4b[1]\n" + ".inst 0x4fa4e935 // sdot v21.4s, v9.16b, v4.4b[3]\n" + "ldr q9, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x80\n" + ".inst 0x4f86e15c // sdot v28.4s, v10.16b, v6.4b[0]\n" + ".inst 0x4f86e95b // sdot v27.4s, v10.16b, v6.4b[2]\n" + ".inst 0x4f85e15a // sdot v26.4s, v10.16b, v5.4b[0]\n" + ".inst 0x4f85e959 // sdot v25.4s, v10.16b, v5.4b[2]\n" + ".inst 0x4f84e158 // sdot v24.4s, v10.16b, v4.4b[0]\n" + ".inst 0x4f84e957 // sdot v23.4s, v10.16b, v4.4b[2]\n" + ".inst 0x4f82e156 // sdot v22.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f82e955 // sdot v21.4s, v10.16b, v2.4b[2]\n" + ".inst 0x4fa6e17c // sdot v28.4s, v11.16b, v6.4b[1]\n" + ".inst 0x4fa6e97b // sdot v27.4s, v11.16b, v6.4b[3]\n" + ".inst 0x4fa5e17a // sdot v26.4s, v11.16b, v5.4b[1]\n" + ".inst 0x4fa5e979 // sdot v25.4s, v11.16b, v5.4b[3]\n" + ".inst 0x4fa4e178 // sdot v24.4s, v11.16b, v4.4b[1]\n" + ".inst 0x4fa4e977 // sdot v23.4s, v11.16b, v4.4b[3]\n" + ".inst 0x4fa2e176 // sdot v22.4s, v11.16b, v2.4b[1]\n" + ".inst 0x4fa2e975 // sdot v21.4s, v11.16b, v2.4b[3]\n" + ".inst 0x4f85e11c // sdot v28.4s, v8.16b, v5.4b[0]\n" + ".inst 0x4f85e91b // sdot v27.4s, v8.16b, v5.4b[2]\n" + ".inst 0x4f84e11a // sdot v26.4s, v8.16b, v4.4b[0]\n" + ".inst 0x4f84e919 // sdot v25.4s, v8.16b, v4.4b[2]\n" + ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f82e917 // sdot v23.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa5e13c // sdot v28.4s, v9.16b, v5.4b[1]\n" + ".inst 0x4fa5e93b // sdot v27.4s, v9.16b, v5.4b[3]\n" + ".inst 0x4fa4e13a // sdot v26.4s, v9.16b, v4.4b[1]\n" + ".inst 0x4fa4e939 // sdot v25.4s, v9.16b, v4.4b[3]\n" + ".inst 0x4fa2e138 // sdot v24.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa2e937 // sdot v23.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa1e136 // sdot v22.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa1e935 // sdot v21.4s, v9.16b, v1.4b[3]\n" + "sqrdmulh v28.4s, v28.4s, v20.4s\n" + "sqrdmulh v27.4s, v27.4s, v20.4s\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "and v18.16b, v28.16b, v19.16b\n" + "and v17.16b, v27.16b, v19.16b\n" + "and v16.16b, v26.16b, v19.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v18.4s\n" + "sqadd v27.4s, v27.4s, v17.4s\n" + "sqadd v26.4s, v26.4s, v16.4s\n" + "and v16.16b, v25.16b, v19.16b\n" + "srshl v28.4s, v28.4s, v19.4s\n" + "srshl v27.4s, v27.4s, v19.4s\n" + "srshl v26.4s, v26.4s, v19.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v26.4s, v26.4s, v0.4s\n" + "smin v28.4s, v28.4s, v30.4s\n" + 
"smin v27.4s, v27.4s, v30.4s\n" + "smin v26.4s, v26.4s, v30.4s\n" + "smax v28.4s, v28.4s, v31.4s\n" + "smax v27.4s, v27.4s, v31.4s\n" + "smax v26.4s, v26.4s, v31.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "sqrdmulh v24.4s, v24.4s, v20.4s\n" + "sqrdmulh v23.4s, v23.4s, v20.4s\n" + "srshl v25.4s, v25.4s, v19.4s\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "and v16.16b, v24.16b, v19.16b\n" + "and v17.16b, v23.16b, v19.16b\n" + "add v25.4s, v25.4s, v0.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v30.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "smax v25.4s, v25.4s, v31.4s\n" + "and v16.16b, v22.16b, v19.16b\n" + "srshl v24.4s, v24.4s, v19.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "srshl v23.4s, v23.4s, v19.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v0.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v23.4s, v23.4s, v0.4s\n" + "smin v24.4s, v24.4s, v30.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "smin v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v31.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v23.4s, v23.4s, v31.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "srshl v22.4s, v22.4s, v19.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "and v16.16b, v21.16b, v19.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "add v22.4s, v22.4s, v0.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v22.4s, v22.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "smax v22.4s, v22.4s, v31.4s\n" + "srshl v21.4s, v21.4s, v19.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "add v21.4s, v21.4s, v0.4s\n" + "smin v21.4s, v21.4s, v30.4s\n" + "smax v21.4s, v21.4s, v31.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "blt 3f\n" + "str s28, [x26, #0x0]\n" + "str s27, [x25, #0x0]\n" + "str s26, [x24, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s24, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s22, [x20, #0x0]\n" + "str s21, [x19, #0x0]\n" + "b 4f\n" + "3:" // Tail: Oddments + "st1 { v28.b }[0], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[0], [x25], #0x1\n" + "st1 { v26.b }[0], [x24], #0x1\n" + "st1 { v25.b }[0], [x23], #0x1\n" + "st1 { v24.b }[0], [x22], #0x1\n" + "st1 { v23.b }[0], [x21], #0x1\n" + "st1 { v22.b }[0], [x20], #0x1\n" + "st1 { v21.b }[0], [x19], #0x1\n" + "beq 4f\n" + "st1 { v28.b }[1], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[1], [x25], #0x1\n" + "st1 { v26.b }[1], [x24], #0x1\n" + "st1 { v25.b }[1], [x23], #0x1\n" + "st1 { v24.b }[1], [x22], #0x1\n" + "st1 { v23.b }[1], [x21], #0x1\n" + "st1 { v22.b }[1], [x20], #0x1\n" + "st1 { v21.b }[1], [x19], #0x1\n" + "beq 4f\n" + "st1 { v28.b }[2], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[2], [x25], #0x1\n" + "st1 { v26.b }[2], [x24], #0x1\n" + "st1 { v25.b }[2], [x23], #0x1\n" + "st1 { v24.b }[2], [x22], #0x1\n" + "st1 { v23.b }[2], [x21], #0x1\n" + "st1 { v22.b }[2], [x20], #0x1\n" + "st1 { v21.b }[2], [x19], #0x1\n" + "beq 4f\n" + "st1 { v28.b }[3], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[3], [x25], #0x1\n" + "st1 { v26.b }[3], [x24], #0x1\n" + "st1 { v25.b }[3], [x23], #0x1\n" + "st1 { 
v24.b }[3], [x22], #0x1\n" + "st1 { v23.b }[3], [x21], #0x1\n" + "st1 { v22.b }[3], [x20], #0x1\n" + "st1 { v21.b }[3], [x19], #0x1\n" + "4:" // Tail: End + "add SP, SP, #0x80\n" + : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params) + : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp new file mode 100644 index 0000000000..d0ae00d260 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); + +struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int output_rows(void) { return 2; }; + constexpr static unsigned int output_cols(void) { return 8; }; + + constexpr static unsigned int output_col_regs(void) { return 2; }; + + kern_type kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + + a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..c0acd8805e --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp @@ -0,0 +1,1484 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "arm_gemm.hpp" +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl( + const int8_t *const *const inptrs, + int8_t *const *const outptrs, + const int8_t *weights, + const int32_t *bias, + const unsigned int kernel_points, + const unsigned int n_output_channels, + const int32_t *per_channel_left_shifts, + const int32_t *per_channel_muls, + const int32_t *per_channel_right_shifts, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "mov x9, #0x0\n" + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v14.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n" + "ld1r { v13.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n" + "ld1r { v12.16b }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "ld1r { v11.16b }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v10.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n" + "ld1r { v9.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n" + "ld1r { v8.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n" + "ld1r { v7.4s }, [x19]\n" + "lsr x28, %x[n_output_channels], #0x2\n" + "cbz x28, 9f\n" + "1:" // Output channel loop + "movi v16.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x9, #0x2\n" + "ldr q16, [%x[bias], x19]\n" + "2:" // Output channel loop: Load bias: Done + "mov v6.16b, v16.16b\n" + "mov v5.16b, v16.16b\n" + "mov v4.16b, v16.16b\n" + "mov v31.16b, v16.16b\n" + "mov v30.16b, v16.16b\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "cbz %x[rq_mul_ptr], 3f\n" + "lsl x19, x9, #0x2\n" + "ldr q8, [%x[rq_mul_ptr], x19]\n" + "ldr q7, [%x[rq_right_shift_ptr], x19]\n" + "cbz %x[rq_left_shift_ptr], 3f\n" + "ldr q9, [%x[rq_left_shift_ptr], x19]\n" + "3:" // Output channel loop: Load quantization parameters: Done + "ldr s17, [%x[weights]], #0x4\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "mov x19, %x[inptrs]\n" + "ldp x25, x27, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "ldr d3, [x25, #0x0]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr d2, [x27, #0x0]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "cbz x20, 7f\n" + "ldp x25, x27, [x19], #0x10\n" + "ldr s16, [%x[weights]], #0x4\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "ldr d1, [x25, #0x0]\n" + "subs x20, x20, #0x1\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr d0, [x27, #0x0]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "beq 5f\n" + "4:" // Output channel loop: Kernel loop + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "ssubl v3.8h, 
v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "ldr d1, [x25, #0x0]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "ldr d0, [x27, #0x0]\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr s16, [%x[weights]], #0x4\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "bgt 4b\n" + "5:" // Output channel loop: Kernel loop tail + "tbnz %x[kernel_points], #0, 6f\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, 
v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, 
v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "b 8f\n" + "6:" // Output channel loop: Odd tail + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, 
v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, 
v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin 
v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "b 8f\n" + "7:" // Output channel loop: Single kernel point + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, 
v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" 
+ "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "8:" // Output channel loop: Done + "add x9, x9, #0x4\n" + "cmp x9, x28, LSL #2\n" + "blt 1b\n" + "tst %x[n_output_channels], #0x3\n" + "beq 26f\n" + "9:" // Output channel oddments + "movi v16.4s, #0x0\n" + "cbz %x[bias], 12f\n" + "add x19, %x[bias], x9, LSL #2\n" + "tbz %x[n_output_channels], #1, 10f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #0, 11f\n" + "ld1 { v16.s }[2], [x19]\n" + "b 11f\n" + "10:" // Output channel oddments: Load bias: Bit 1: Unset + "tbz %x[n_output_channels], #0, 11f\n" + "ld1 { v16.s }[0], [x19]\n" + "11:" // Output channel oddments: Load bias: Bit 1: End + + "12:" // Output channel oddments: Load bias: Done + "mov v6.16b, v16.16b\n" + "mov v5.16b, v16.16b\n" + "mov v4.16b, v16.16b\n" + "mov v31.16b, v16.16b\n" + "mov v30.16b, v16.16b\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "cbz %x[rq_mul_ptr], 18f\n" + "add x21, %x[rq_mul_ptr], x9, LSL #2\n" + "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n" + "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n" + "cbz %x[rq_left_shift_ptr], 15f\n" + "tbz %x[n_output_channels], #1, 13f\n" + "ld1 { v8.d }[0], [x21], #0x8\n" + "ld1 { v7.d }[0], [x20], #0x8\n" + "ld1 { v9.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #0, 14f\n" + "ld1 { v8.s }[2], [x21], #0x4\n" + "ld1 { v7.s }[2], [x20], #0x4\n" + "ld1 { v9.s }[2], [x19], #0x4\n" + "b 14f\n" + "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset + "tbz %x[n_output_channels], #0, 14f\n" + "ld1 { v8.s }[0], [x21], #0x4\n" + "ld1 { v7.s }[0], [x20], #0x4\n" + "ld1 { v9.s }[0], [x19], #0x4\n" + "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End + "b 18f\n" + "15:" // Output channel oddments: Load quantization parameters: No left shift + "tbz %x[n_output_channels], #1, 16f\n" + "ld1 { v8.d }[0], [x21], #0x8\n" + "ld1 { v7.d }[0], [x20], #0x8\n" + "tbz %x[n_output_channels], #0, 17f\n" + "ld1 { v8.s }[2], [x21], #0x4\n" + "ld1 { v7.s }[2], [x20], #0x4\n" + "b 17f\n" + "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset + "tbz %x[n_output_channels], #0, 17f\n" + "ld1 { v8.s }[0], [x21], #0x4\n" + "ld1 { v7.s }[0], [x20], #0x4\n" + "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End + + 
"18:" // Output channel oddments: Load quantization parameters: Done + "ldr s17, [%x[weights]], #0x4\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "mov x19, %x[inptrs]\n" + "ldp x25, x27, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "ldr d3, [x25, #0x0]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr d2, [x27, #0x0]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "cbz x20, 22f\n" + "ldp x25, x27, [x19], #0x10\n" + "ldr s16, [%x[weights]], #0x4\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "ldr d1, [x25, #0x0]\n" + "subs x20, x20, #0x1\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr d0, [x27, #0x0]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "beq 20f\n" + "19:" // Output channel oddments: Kernel loop + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "ldr d1, [x25, #0x0]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "ldr d0, [x27, #0x0]\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr s16, [%x[weights]], #0x4\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "bgt 19b\n" + "20:" // Output channel oddments: Kernel loop tail + "tbnz %x[kernel_points], #0, 21f\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "b 
23f\n" + "21:" // Output channel oddments: Odd tail + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "b 23f\n" + "22:" // Output channel oddments: Single kernel point + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "23:" // Output channel oddments: Done + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "and v17.16b, v4.16b, v7.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "and v16.16b, v31.16b, v7.16b\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "add v4.4s, v4.4s, v10.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "smin v4.4s, v4.4s, 
v13.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "and v16.16b, v30.16b, v7.16b\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "and v17.16b, v29.16b, v7.16b\n" + "smin v31.4s, v31.4s, v13.4s\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smax v31.4s, v31.4s, v14.4s\n" + "and v16.16b, v28.16b, v7.16b\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "add v29.4s, v29.4s, v10.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smin v29.4s, v29.4s, v13.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v28.4s, v28.4s, v10.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smax v28.4s, v28.4s, v14.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "and v17.16b, v26.16b, v7.16b\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "and v16.16b, v25.16b, v7.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v27.4s, v27.4s, v13.4s\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "add v26.4s, v26.4s, v10.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "and v17.16b, v24.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "smin v25.4s, v25.4s, v13.4s\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "smax v25.4s, v25.4s, v14.4s\n" + "and v16.16b, v23.16b, v7.16b\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v24.4s, v24.4s, v13.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "uzp1 v24.16b, 
v24.16b, v24.16b\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "and v16.16b, v21.16b, v7.16b\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "smax v23.4s, v23.4s, v14.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v22.4s, v22.4s, v13.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "add v21.4s, v21.4s, v10.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "smax v21.4s, v21.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "smin v20.4s, v20.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "tbz %x[n_output_channels], #1, 24f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.h }[0], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.h }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.h }[0], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.h }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.h }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.h }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.h }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.h }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.h }[0], [x25]\n" + "add x9, x9, #0x2\n" + "st1 { v19.h }[0], [x26]\n" + "tbz %x[n_output_channels], #0, 25f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.b }[2], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.b }[2], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.b }[2], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.b }[2], 
[x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.b }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.b }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.b }[2], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.b }[2], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.b }[2], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.b }[2], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.b }[2], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.b }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.b }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.b }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v19.b }[2], [x26]\n" + "b 25f\n" + "24:" // Output channel oddments: Done: Store: Bit 1: Unset + "tbz %x[n_output_channels], #0, 25f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.b }[0], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.b }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.b }[0], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.b }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.b }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.b }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.b }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.b }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.b }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.b }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.b }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.b }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.b }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.b }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.b }[0], [x25]\n" + "st1 { v19.b }[0], [x26]\n" + "25:" // Output channel oddments: Done: Store: Bit 1: End + + "26:" // Done + + : [weights] "+&r" (weights) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), 
[qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..0fde00ba37 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..0fde00ba37
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+
+struct a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef int8_t input_type;
+  typedef int8_t weight_type;
+  typedef int8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size;
+
+  kern_type kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+
+  a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
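The input tile in this descriptor follows from the output tile, kernel size and stride: a 2x2 output of a 3x3 stride-1 kernel needs a 4x4 input patch. A sketch of the relation, with an illustrative helper that is not part of the library:

// An output tile of R rows from a K-row kernel at stride S reads
// (R - 1) * S + K input rows; likewise for columns.
constexpr unsigned int input_extent(unsigned int output, unsigned int stride, unsigned int kernel)
{
    return (output - 1) * stride + kernel;
}
static_assert(input_extent(2, 1, 3) == 4, "matches input_rows/input_cols above");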
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..bdbda178b3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+{
+  __asm__ __volatile__(
+ "ldp x15, x14, [%x[inptrs], #0x0]\n" + "add SP, SP, #-0x80\n" + "ldp x13, x12, [%x[inptrs], #0x10]\n" + "mov x11, #0x0\n" + "ldp x10, x9, [%x[inptrs], #0x20]\n" + "lsr x28, %x[n_channels], #0x4\n" + "ldp x27, x26, [%x[inptrs], #0x30]\n" + "add x25, %x[qp], %[offsetof_Requantize32_minval]\n" + "ldp x24, x23, [%x[outptrs], #0x0]\n" + "add x22, %x[qp], %[offsetof_Requantize32_maxval]\n" + "ldp x21, x20, [%x[outptrs], #0x10]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v12.4s }, [x25]\n" + "ld1r { v11.4s }, [x22]\n" + "ld1r { v10.4s }, [x19]\n" + "cbz x28, 2f\n" + "1:" // Loop + "ldr q27, [x15, x11]\n" + "subs x28, x28, #0x1\n" + "ldr q1, [x14, x11]\n" + "ldp x15, x14, [%x[inptrs], #0x40]\n" + "ldr q25, [x13, x11]\n" + "zip1 v6.16b, v27.16b, v25.16b\n" + "ldr q23, [x12, x11]\n" + "zip2 v9.16b, v27.16b, v25.16b\n" + "ldp x13, x12, [%x[inptrs], #0x50]\n" + "ldr q31, [x10, x11]\n" + "zip1 v5.16b, v1.16b, v23.16b\n" + "ldr q28, [x9, x11]\n" + "zip2 v3.16b, v1.16b, v23.16b\n" + "ldp x10, x9, [%x[inptrs], #0x60]\n" + "zip1 v8.16b, v6.16b, v5.16b\n" + "ldr q21, [x27, x11]\n" + "zip2 v7.16b, v6.16b, v5.16b\n" + "ldr q26, [x26, x11]\n" + "zip1 v6.16b, v9.16b, v3.16b\n" + "ldp x27, x26, [%x[inptrs], #0x70]\n" + "zip2 v5.16b, v9.16b, v3.16b\n" + "ldr q24, [x15, x11]\n" + "ldr q22, [x14, x11]\n" + "zip1 v2.16b, v31.16b, v21.16b\n" + "zip2 v4.16b, v31.16b, v21.16b\n" + "ldp x15, x14, [%x[inptrs], #0x0]\n" + "zip1 v1.16b, v28.16b, v26.16b\n" + "ldr q20, [x13, x11]\n" + "zip2 v31.16b, v28.16b, v26.16b\n" + "ldr q16, [x12, x11]\n" + "zip1 v3.16b, v2.16b, v1.16b\n" + "ldp x13, x12, [%x[inptrs], #0x10]\n" + "zip2 v2.16b, v2.16b, v1.16b\n" + "ldr q19, [x10, x11]\n" + "zip1 v1.16b, v4.16b, v31.16b\n" + "ldr q0, [x9, x11]\n" + "zip1 v28.16b, v24.16b, v20.16b\n" + "ldp x10, x9, [%x[inptrs], #0x20]\n" + "zip2 v26.16b, v24.16b, v20.16b\n" + "ldr q18, [x27, x11]\n" + "zip1 v24.16b, v22.16b, v16.16b\n" + "ldr q17, [x26, x11]\n" + "zip2 v22.16b, v22.16b, v16.16b\n" + "ldp x27, x26, [%x[inptrs], #0x30]\n" + "zip2 v16.16b, v4.16b, v31.16b\n" + "str q6, [SP, #0x0]\n" + "zip1 v31.16b, v28.16b, v24.16b\n" + "str q5, [SP, #0x10]\n" + "zip1 v20.16b, v19.16b, v18.16b\n" + "str q1, [SP, #0x20]\n" + "zip2 v19.16b, v19.16b, v18.16b\n" + "str q16, [SP, #0x30]\n" + "zip1 v18.16b, v0.16b, v17.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "zip2 v17.16b, v0.16b, v17.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "zip2 v28.16b, v28.16b, v24.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "zip1 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x40]\n" + "zip2 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x50]\n" + "zip1 v26.16b, v20.16b, v18.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "zip2 v24.16b, v20.16b, v18.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x60]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x70]\n" + "mov v22.16b, v30.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "mov v20.16b, v30.16b\n" + "mov v19.16b, v30.16b\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + ".inst 0x4e83977e // sdot v30.4s,
v27.16b, v3.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + "ldr q8, [SP, #0x0]\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + "ldr q29, [%x[params], #0x70]\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + "ldr q3, [SP, #0x20]\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + "ldr q27, [%x[params], #0x80]\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + "ldr q31, [SP, #0x40]\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "ldr q25, [%x[params], #0x90]\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "ldr q26, [SP, #0x60]\n" + "and v18.16b, v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0xa0]\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v19.16b, v21.16b\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "add v20.4s, v20.4s, v10.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0xb0]\n" + "add v22.4s, v22.4s, v10.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v22.4s, v22.4s, v12.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x24, x11]\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "ldr q30, [%x[params], #0x60]\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x11]\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x21, x11]\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x20, x11]\n" + "mov v19.16b, v30.16b\n" + "add x11, x11, #0x4\n" + ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + "ext v7.16b, v7.16b, v7.16b, #0x1\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n" + "ldr q7, [SP, #0x10]\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + "ldr q29, [%x[params], #0xd0]\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + "ldr q2, [SP, #0x30]\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + "ldr q27, [%x[params], #0xe0]\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + "ldr 
q28, [SP, #0x50]\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "ldr q25, [%x[params], #0xf0]\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "ldr q24, [SP, #0x70]\n" + "and v18.16b, v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0x100]\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v19.16b, v21.16b\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "add v20.4s, v20.4s, v10.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0x110]\n" + "add v22.4s, v22.4s, v10.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v22.4s, v22.4s, v12.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x24, x11]\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "ldr q30, [%x[params], #0xc0]\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x11]\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x21, x11]\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x20, x11]\n" + "mov v19.16b, v30.16b\n" + "add x11, x11, #0x4\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + "ldr q29, [%x[params], #0x130]\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + "ldr q27, [%x[params], #0x140]\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "ldr q25, [%x[params], #0x150]\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "and v18.16b, v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0x160]\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v19.16b, v21.16b\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "add v20.4s, 
v20.4s, v10.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0x170]\n" + "add v22.4s, v22.4s, v10.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v22.4s, v22.4s, v12.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x24, x11]\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "ldr q30, [%x[params], #0x120]\n" + "add %x[params], %x[params], #0x180\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x11]\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "mov v22.16b, v30.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x21, x11]\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "mov v20.16b, v30.16b\n" + ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x20, x11]\n" + "mov v19.16b, v30.16b\n" + "add x11, x11, #0x4\n" + ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + "ext v7.16b, v7.16b, v7.16b, #0x1\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "and v18.16b, v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v17.16b, v22.16b, v21.16b\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "add v20.4s, v20.4s, v10.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x24, x11]\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x11]\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x21, x11]\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x20, x11]\n" + "add x11, x11, #0x4\n" + "bgt 1b\n" + "tst %x[n_channels], #0xf\n" + "beq 34f\n" + "2:" // Oddments + "and x19, %x[n_channels], #0xf\n" + "add x15, x15, x11\n" + "add x14, x14, x11\n" + "add x13, x13, x11\n" + "add x12, x12, x11\n" + "add x10, x10, x11\n" + "add x9, x9, 
x11\n" + "add x27, x27, x11\n" + "add x26, x26, x11\n" + "tbz %x[n_channels], #3, 6f\n" + "ld1 { v27.d }[0], [x15], #0x8\n" + "ld1 { v1.d }[0], [x14], #0x8\n" + "ld1 { v25.d }[0], [x13], #0x8\n" + "ld1 { v23.d }[0], [x12], #0x8\n" + "ld1 { v31.d }[0], [x10], #0x8\n" + "ld1 { v28.d }[0], [x9], #0x8\n" + "ld1 { v21.d }[0], [x27], #0x8\n" + "ld1 { v26.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #2, 4f\n" + "ld1 { v27.s }[2], [x15], #0x4\n" + "ld1 { v1.s }[2], [x14], #0x4\n" + "ld1 { v25.s }[2], [x13], #0x4\n" + "ld1 { v23.s }[2], [x12], #0x4\n" + "ld1 { v31.s }[2], [x10], #0x4\n" + "ld1 { v28.s }[2], [x9], #0x4\n" + "ld1 { v21.s }[2], [x27], #0x4\n" + "ld1 { v26.s }[2], [x26], #0x4\n" + "tbz %x[n_channels], #1, 3f\n" + "ld1 { v27.h }[6], [x15], #0x2\n" + "ld1 { v1.h }[6], [x14], #0x2\n" + "ld1 { v25.h }[6], [x13], #0x2\n" + "ld1 { v23.h }[6], [x12], #0x2\n" + "ld1 { v31.h }[6], [x10], #0x2\n" + "ld1 { v28.h }[6], [x9], #0x2\n" + "ld1 { v21.h }[6], [x27], #0x2\n" + "ld1 { v26.h }[6], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[14], [x15], #0x1\n" + "ld1 { v1.b }[14], [x14], #0x1\n" + "ld1 { v25.b }[14], [x13], #0x1\n" + "ld1 { v23.b }[14], [x12], #0x1\n" + "ld1 { v31.b }[14], [x10], #0x1\n" + "ld1 { v28.b }[14], [x9], #0x1\n" + "ld1 { v21.b }[14], [x27], #0x1\n" + "ld1 { v26.b }[14], [x26], #0x1\n" + "b 10f\n" + "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[12], [x15], #0x1\n" + "ld1 { v1.b }[12], [x14], #0x1\n" + "ld1 { v25.b }[12], [x13], #0x1\n" + "ld1 { v23.b }[12], [x12], #0x1\n" + "ld1 { v31.b }[12], [x10], #0x1\n" + "ld1 { v28.b }[12], [x9], #0x1\n" + "ld1 { v21.b }[12], [x27], #0x1\n" + "ld1 { v26.b }[12], [x26], #0x1\n" + "b 10f\n" + "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset + "tbz %x[n_channels], #1, 5f\n" + "ld1 { v27.h }[4], [x15], #0x2\n" + "ld1 { v1.h }[4], [x14], #0x2\n" + "ld1 { v25.h }[4], [x13], #0x2\n" + "ld1 { v23.h }[4], [x12], #0x2\n" + "ld1 { v31.h }[4], [x10], #0x2\n" + "ld1 { v28.h }[4], [x9], #0x2\n" + "ld1 { v21.h }[4], [x27], #0x2\n" + "ld1 { v26.h }[4], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[10], [x15], #0x1\n" + "ld1 { v1.b }[10], [x14], #0x1\n" + "ld1 { v25.b }[10], [x13], #0x1\n" + "ld1 { v23.b }[10], [x12], #0x1\n" + "ld1 { v31.b }[10], [x10], #0x1\n" + "ld1 { v28.b }[10], [x9], #0x1\n" + "ld1 { v21.b }[10], [x27], #0x1\n" + "ld1 { v26.b }[10], [x26], #0x1\n" + "b 10f\n" + "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[8], [x15], #0x1\n" + "ld1 { v1.b }[8], [x14], #0x1\n" + "ld1 { v25.b }[8], [x13], #0x1\n" + "ld1 { v23.b }[8], [x12], #0x1\n" + "ld1 { v31.b }[8], [x10], #0x1\n" + "ld1 { v28.b }[8], [x9], #0x1\n" + "ld1 { v21.b }[8], [x27], #0x1\n" + "ld1 { v26.b }[8], [x26], #0x1\n" + "b 10f\n" + "6:" // Oddments: Load (A): Bit 3: Unset + "tbz %x[n_channels], #2, 8f\n" + "ld1 { v27.s }[0], [x15], #0x4\n" + "ld1 { v1.s }[0], [x14], #0x4\n" + "ld1 { v25.s }[0], [x13], #0x4\n" + "ld1 { v23.s }[0], [x12], #0x4\n" + "ld1 { v31.s }[0], [x10], #0x4\n" + "ld1 { v28.s }[0], [x9], #0x4\n" + "ld1 { v21.s }[0], [x27], #0x4\n" + "ld1 { v26.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #1, 7f\n" + "ld1 { v27.h }[2], [x15], #0x2\n" + "ld1 { v1.h }[2], [x14], #0x2\n" + "ld1 { v25.h }[2], [x13], #0x2\n" + "ld1 { v23.h }[2], [x12], #0x2\n" + "ld1 { v31.h }[2], [x10], #0x2\n" + "ld1 { v28.h }[2], [x9], #0x2\n" + "ld1 { v21.h }[2], [x27], #0x2\n" + "ld1 { v26.h }[2], [x26], #0x2\n" + "tbz 
%x[n_channels], #0, 10f\n" + "ld1 { v27.b }[6], [x15], #0x1\n" + "ld1 { v1.b }[6], [x14], #0x1\n" + "ld1 { v25.b }[6], [x13], #0x1\n" + "ld1 { v23.b }[6], [x12], #0x1\n" + "ld1 { v31.b }[6], [x10], #0x1\n" + "ld1 { v28.b }[6], [x9], #0x1\n" + "ld1 { v21.b }[6], [x27], #0x1\n" + "ld1 { v26.b }[6], [x26], #0x1\n" + "b 10f\n" + "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[4], [x15], #0x1\n" + "ld1 { v1.b }[4], [x14], #0x1\n" + "ld1 { v25.b }[4], [x13], #0x1\n" + "ld1 { v23.b }[4], [x12], #0x1\n" + "ld1 { v31.b }[4], [x10], #0x1\n" + "ld1 { v28.b }[4], [x9], #0x1\n" + "ld1 { v21.b }[4], [x27], #0x1\n" + "ld1 { v26.b }[4], [x26], #0x1\n" + "b 10f\n" + "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset + "tbz %x[n_channels], #1, 9f\n" + "ld1 { v27.h }[0], [x15], #0x2\n" + "ld1 { v1.h }[0], [x14], #0x2\n" + "ld1 { v25.h }[0], [x13], #0x2\n" + "ld1 { v23.h }[0], [x12], #0x2\n" + "ld1 { v31.h }[0], [x10], #0x2\n" + "ld1 { v28.h }[0], [x9], #0x2\n" + "ld1 { v21.h }[0], [x27], #0x2\n" + "ld1 { v26.h }[0], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[2], [x15], #0x1\n" + "ld1 { v1.b }[2], [x14], #0x1\n" + "ld1 { v25.b }[2], [x13], #0x1\n" + "ld1 { v23.b }[2], [x12], #0x1\n" + "ld1 { v31.b }[2], [x10], #0x1\n" + "ld1 { v28.b }[2], [x9], #0x1\n" + "ld1 { v21.b }[2], [x27], #0x1\n" + "ld1 { v26.b }[2], [x26], #0x1\n" + "b 10f\n" + "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[0], [x15], #0x1\n" + "ld1 { v1.b }[0], [x14], #0x1\n" + "ld1 { v25.b }[0], [x13], #0x1\n" + "ld1 { v23.b }[0], [x12], #0x1\n" + "ld1 { v31.b }[0], [x10], #0x1\n" + "ld1 { v28.b }[0], [x9], #0x1\n" + "ld1 { v21.b }[0], [x27], #0x1\n" + "ld1 { v26.b }[0], [x26], #0x1\n" + "10:" // Oddments: Load (A): Bit 3: End + "ldp x15, x14, [%x[inptrs], #0x40]\n" + "add x15, x15, x11\n" + "ldp x13, x12, [%x[inptrs], #0x50]\n" + "ldp x10, x9, [%x[inptrs], #0x60]\n" + "add x14, x14, x11\n" + "ldp x27, x26, [%x[inptrs], #0x70]\n" + "add x13, x13, x11\n" + "add x12, x12, x11\n" + "add x10, x10, x11\n" + "add x9, x9, x11\n" + "add x27, x27, x11\n" + "add x26, x26, x11\n" + "tbz %x[n_channels], #3, 14f\n" + "ld1 { v24.d }[0], [x15], #0x8\n" + "ld1 { v22.d }[0], [x14], #0x8\n" + "ld1 { v20.d }[0], [x13], #0x8\n" + "ld1 { v16.d }[0], [x12], #0x8\n" + "ld1 { v19.d }[0], [x10], #0x8\n" + "ld1 { v0.d }[0], [x9], #0x8\n" + "ld1 { v18.d }[0], [x27], #0x8\n" + "ld1 { v17.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #2, 12f\n" + "ld1 { v24.s }[2], [x15], #0x4\n" + "ld1 { v22.s }[2], [x14], #0x4\n" + "ld1 { v20.s }[2], [x13], #0x4\n" + "ld1 { v16.s }[2], [x12], #0x4\n" + "ld1 { v19.s }[2], [x10], #0x4\n" + "ld1 { v0.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v17.s }[2], [x26], #0x4\n" + "tbz %x[n_channels], #1, 11f\n" + "ld1 { v24.h }[6], [x15], #0x2\n" + "ld1 { v22.h }[6], [x14], #0x2\n" + "ld1 { v20.h }[6], [x13], #0x2\n" + "ld1 { v16.h }[6], [x12], #0x2\n" + "ld1 { v19.h }[6], [x10], #0x2\n" + "ld1 { v0.h }[6], [x9], #0x2\n" + "ld1 { v18.h }[6], [x27], #0x2\n" + "ld1 { v17.h }[6], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[14], [x15], #0x1\n" + "ld1 { v22.b }[14], [x14], #0x1\n" + "ld1 { v20.b }[14], [x13], #0x1\n" + "ld1 { v16.b }[14], [x12], #0x1\n" + "ld1 { v19.b }[14], [x10], #0x1\n" + "ld1 { v0.b }[14], [x9], #0x1\n" + "ld1 { v18.b }[14], [x27], #0x1\n" + "ld1 { v17.b }[14], [x26], #0x1\n" + "b 18f\n" + "11:" // Oddments: Load (B): Bit 3: Bit 
2: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[12], [x15], #0x1\n" + "ld1 { v22.b }[12], [x14], #0x1\n" + "ld1 { v20.b }[12], [x13], #0x1\n" + "ld1 { v16.b }[12], [x12], #0x1\n" + "ld1 { v19.b }[12], [x10], #0x1\n" + "ld1 { v0.b }[12], [x9], #0x1\n" + "ld1 { v18.b }[12], [x27], #0x1\n" + "ld1 { v17.b }[12], [x26], #0x1\n" + "b 18f\n" + "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset + "tbz %x[n_channels], #1, 13f\n" + "ld1 { v24.h }[4], [x15], #0x2\n" + "ld1 { v22.h }[4], [x14], #0x2\n" + "ld1 { v20.h }[4], [x13], #0x2\n" + "ld1 { v16.h }[4], [x12], #0x2\n" + "ld1 { v19.h }[4], [x10], #0x2\n" + "ld1 { v0.h }[4], [x9], #0x2\n" + "ld1 { v18.h }[4], [x27], #0x2\n" + "ld1 { v17.h }[4], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[10], [x15], #0x1\n" + "ld1 { v22.b }[10], [x14], #0x1\n" + "ld1 { v20.b }[10], [x13], #0x1\n" + "ld1 { v16.b }[10], [x12], #0x1\n" + "ld1 { v19.b }[10], [x10], #0x1\n" + "ld1 { v0.b }[10], [x9], #0x1\n" + "ld1 { v18.b }[10], [x27], #0x1\n" + "ld1 { v17.b }[10], [x26], #0x1\n" + "b 18f\n" + "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[8], [x15], #0x1\n" + "ld1 { v22.b }[8], [x14], #0x1\n" + "ld1 { v20.b }[8], [x13], #0x1\n" + "ld1 { v16.b }[8], [x12], #0x1\n" + "ld1 { v19.b }[8], [x10], #0x1\n" + "ld1 { v0.b }[8], [x9], #0x1\n" + "ld1 { v18.b }[8], [x27], #0x1\n" + "ld1 { v17.b }[8], [x26], #0x1\n" + "b 18f\n" + "14:" // Oddments: Load (B): Bit 3: Unset + "tbz %x[n_channels], #2, 16f\n" + "ld1 { v24.s }[0], [x15], #0x4\n" + "ld1 { v22.s }[0], [x14], #0x4\n" + "ld1 { v20.s }[0], [x13], #0x4\n" + "ld1 { v16.s }[0], [x12], #0x4\n" + "ld1 { v19.s }[0], [x10], #0x4\n" + "ld1 { v0.s }[0], [x9], #0x4\n" + "ld1 { v18.s }[0], [x27], #0x4\n" + "ld1 { v17.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #1, 15f\n" + "ld1 { v24.h }[2], [x15], #0x2\n" + "ld1 { v22.h }[2], [x14], #0x2\n" + "ld1 { v20.h }[2], [x13], #0x2\n" + "ld1 { v16.h }[2], [x12], #0x2\n" + "ld1 { v19.h }[2], [x10], #0x2\n" + "ld1 { v0.h }[2], [x9], #0x2\n" + "ld1 { v18.h }[2], [x27], #0x2\n" + "ld1 { v17.h }[2], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[6], [x15], #0x1\n" + "ld1 { v22.b }[6], [x14], #0x1\n" + "ld1 { v20.b }[6], [x13], #0x1\n" + "ld1 { v16.b }[6], [x12], #0x1\n" + "ld1 { v19.b }[6], [x10], #0x1\n" + "ld1 { v0.b }[6], [x9], #0x1\n" + "ld1 { v18.b }[6], [x27], #0x1\n" + "ld1 { v17.b }[6], [x26], #0x1\n" + "b 18f\n" + "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[4], [x15], #0x1\n" + "ld1 { v22.b }[4], [x14], #0x1\n" + "ld1 { v20.b }[4], [x13], #0x1\n" + "ld1 { v16.b }[4], [x12], #0x1\n" + "ld1 { v19.b }[4], [x10], #0x1\n" + "ld1 { v0.b }[4], [x9], #0x1\n" + "ld1 { v18.b }[4], [x27], #0x1\n" + "ld1 { v17.b }[4], [x26], #0x1\n" + "b 18f\n" + "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset + "tbz %x[n_channels], #1, 17f\n" + "ld1 { v24.h }[0], [x15], #0x2\n" + "ld1 { v22.h }[0], [x14], #0x2\n" + "ld1 { v20.h }[0], [x13], #0x2\n" + "ld1 { v16.h }[0], [x12], #0x2\n" + "ld1 { v19.h }[0], [x10], #0x2\n" + "ld1 { v0.h }[0], [x9], #0x2\n" + "ld1 { v18.h }[0], [x27], #0x2\n" + "ld1 { v17.h }[0], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[2], [x15], #0x1\n" + "ld1 { v22.b }[2], [x14], #0x1\n" + "ld1 { v20.b }[2], [x13], #0x1\n" + "ld1 { v16.b }[2], [x12], #0x1\n" + "ld1 { v19.b }[2], [x10], #0x1\n" + "ld1 { v0.b }[2], [x9], #0x1\n" + "ld1 { v18.b }[2], [x27], #0x1\n" + "ld1 { v17.b 
}[2], [x26], #0x1\n" + "b 18f\n" + "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[0], [x15], #0x1\n" + "ld1 { v22.b }[0], [x14], #0x1\n" + "ld1 { v20.b }[0], [x13], #0x1\n" + "ld1 { v16.b }[0], [x12], #0x1\n" + "ld1 { v19.b }[0], [x10], #0x1\n" + "ld1 { v0.b }[0], [x9], #0x1\n" + "ld1 { v18.b }[0], [x27], #0x1\n" + "ld1 { v17.b }[0], [x26], #0x1\n" + "18:" // Oddments: Load (B): Bit 3: End + "zip1 v6.16b, v27.16b, v25.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "cmp x19, #0x4\n" + "zip2 v9.16b, v27.16b, v25.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "zip1 v5.16b, v1.16b, v23.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "zip2 v3.16b, v1.16b, v23.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "zip1 v2.16b, v31.16b, v21.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "zip2 v4.16b, v31.16b, v21.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "zip1 v1.16b, v28.16b, v26.16b\n" + "zip2 v31.16b, v28.16b, v26.16b\n" + "zip1 v28.16b, v24.16b, v20.16b\n" + "zip2 v26.16b, v24.16b, v20.16b\n" + "zip1 v24.16b, v22.16b, v16.16b\n" + "zip2 v22.16b, v22.16b, v16.16b\n" + "zip1 v20.16b, v19.16b, v18.16b\n" + "zip2 v19.16b, v19.16b, v18.16b\n" + "zip1 v18.16b, v0.16b, v17.16b\n" + "zip2 v17.16b, v0.16b, v17.16b\n" + "zip1 v8.16b, v6.16b, v5.16b\n" + "zip2 v7.16b, v6.16b, v5.16b\n" + "zip1 v6.16b, v9.16b, v3.16b\n" + "str q6, [SP, #0x0]\n" + "zip2 v5.16b, v9.16b, v3.16b\n" + "str q5, [SP, #0x10]\n" + "zip1 v3.16b, v2.16b, v1.16b\n" + "zip2 v2.16b, v2.16b, v1.16b\n" + "zip1 v1.16b, v4.16b, v31.16b\n" + "str q1, [SP, #0x20]\n" + "zip2 v16.16b, v4.16b, v31.16b\n" + "str q16, [SP, #0x30]\n" + "zip1 v31.16b, v28.16b, v24.16b\n" + "zip2 v28.16b, v28.16b, v24.16b\n" + "zip1 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x40]\n" + "zip2 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x50]\n" + "zip1 v26.16b, v20.16b, v18.16b\n" + "zip2 v24.16b, v20.16b, v18.16b\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x60]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x70]\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "mov v19.16b, v30.16b\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "and v18.16b, v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v17.16b, v22.16b, v21.16b\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "sqadd 
v22.4s, v22.4s, v17.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "add v20.4s, v20.4s, v10.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 19f\n" + "str s30, [x24, x11]\n" + "str s22, [x23, x11]\n" + "str s20, [x21, x11]\n" + "str s19, [x20, x11]\n" + "b 22f\n" + "19:" // Oddments: Unroll 0: Oddment store + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x21, x21, x11\n" + "add x20, x20, x11\n" + "tbz x19, #1, 20f\n" + "st1 { v30.h }[0], [x24], #0x2\n" + "st1 { v22.h }[0], [x23], #0x2\n" + "st1 { v20.h }[0], [x21], #0x2\n" + "st1 { v19.h }[0], [x20], #0x2\n" + "tbz x19, #0, 21f\n" + "st1 { v30.b }[2], [x24], #0x1\n" + "st1 { v22.b }[2], [x23], #0x1\n" + "st1 { v20.b }[2], [x21], #0x1\n" + "st1 { v19.b }[2], [x20], #0x1\n" + "b 21f\n" + "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset + "tbz x19, #0, 21f\n" + "st1 { v30.b }[0], [x24], #0x1\n" + "st1 { v22.b }[0], [x23], #0x1\n" + "st1 { v20.b }[0], [x21], #0x1\n" + "st1 { v19.b }[0], [x20], #0x1\n" + "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End + + "22:" // Oddments: Unroll 0: After oddment store + "add x11, x11, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "ldr q30, [%x[params], #0x0]\n" + "mov v22.16b, v30.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "cmp x19, #0x4\n" + "mov v20.16b, v30.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "mov v19.16b, v30.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "ldr q23, [%x[params], #0x40]\n" + ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + "ext v7.16b, v7.16b, v7.16b, #0x1\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "and v18.16b, v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v17.16b, v22.16b, v21.16b\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v30.4s, v30.4s, 
v21.4s\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "add v20.4s, v20.4s, v10.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 23f\n" + "str s30, [x24, x11]\n" + "str s22, [x23, x11]\n" + "str s20, [x21, x11]\n" + "str s19, [x20, x11]\n" + "b 26f\n" + "23:" // Oddments: Unroll 1: Oddment store + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x21, x21, x11\n" + "add x20, x20, x11\n" + "tbz x19, #1, 24f\n" + "st1 { v30.h }[0], [x24], #0x2\n" + "st1 { v22.h }[0], [x23], #0x2\n" + "st1 { v20.h }[0], [x21], #0x2\n" + "st1 { v19.h }[0], [x20], #0x2\n" + "tbz x19, #0, 25f\n" + "st1 { v30.b }[2], [x24], #0x1\n" + "st1 { v22.b }[2], [x23], #0x1\n" + "st1 { v20.b }[2], [x21], #0x1\n" + "st1 { v19.b }[2], [x20], #0x1\n" + "b 25f\n" + "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset + "tbz x19, #0, 25f\n" + "st1 { v30.b }[0], [x24], #0x1\n" + "st1 { v22.b }[0], [x23], #0x1\n" + "st1 { v20.b }[0], [x21], #0x1\n" + "st1 { v19.b }[0], [x20], #0x1\n" + "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End + + "26:" // Oddments: Unroll 1: After oddment store + "add x11, x11, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "ldr q8, [SP, #0x0]\n" + "ldr q3, [SP, #0x20]\n" + "cmp x19, #0x4\n" + "ldr q31, [SP, #0x40]\n" + "ldr q26, [SP, #0x60]\n" + "ldr q30, [%x[params], #0x0]\n" + "mov v22.16b, v30.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "mov v20.16b, v30.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "mov v19.16b, v30.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "ldr q23, [%x[params], #0x40]\n" + ".inst 0x4e8897be // sdot v30.4s, v29.16b, v8.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + ".inst 0x4e8397b4 // sdot v20.4s, v29.16b, v3.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + ".inst 0x4e83977e // sdot v30.4s, v27.16b, v3.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + ".inst 0x4e9f9774 // sdot v20.4s, v27.16b, v31.16b\n" + ".inst 0x4e8897b6 // sdot v22.4s, v29.16b, v8.16b\n" + ".inst 0x4e9f973e // sdot v30.4s, v25.16b, v31.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + ".inst 0x4e9a9734 // sdot v20.4s, v25.16b, v26.16b\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x4e8397b3 // sdot v19.4s, v29.16b, v3.16b\n" + ".inst 0x4e839776 // sdot v22.4s, v27.16b, v3.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9f9773 // sdot v19.4s, v27.16b, v31.16b\n" + ".inst 0x4e9f9736 // sdot v22.4s, v25.16b, v31.16b\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e9a9733 // sdot v19.4s, v25.16b, v26.16b\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "and v18.16b, v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v17.16b, v22.16b, v21.16b\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sshr 
v17.4s, v17.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "add v20.4s, v20.4s, v10.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 27f\n" + "str s30, [x24, x11]\n" + "str s22, [x23, x11]\n" + "str s20, [x21, x11]\n" + "str s19, [x20, x11]\n" + "b 30f\n" + "27:" // Oddments: Unroll 2: Oddment store + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x21, x21, x11\n" + "add x20, x20, x11\n" + "tbz x19, #1, 28f\n" + "st1 { v30.h }[0], [x24], #0x2\n" + "st1 { v22.h }[0], [x23], #0x2\n" + "st1 { v20.h }[0], [x21], #0x2\n" + "st1 { v19.h }[0], [x20], #0x2\n" + "tbz x19, #0, 29f\n" + "st1 { v30.b }[2], [x24], #0x1\n" + "st1 { v22.b }[2], [x23], #0x1\n" + "st1 { v20.b }[2], [x21], #0x1\n" + "st1 { v19.b }[2], [x20], #0x1\n" + "b 29f\n" + "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset + "tbz x19, #0, 29f\n" + "st1 { v30.b }[0], [x24], #0x1\n" + "st1 { v22.b }[0], [x23], #0x1\n" + "st1 { v20.b }[0], [x21], #0x1\n" + "st1 { v19.b }[0], [x20], #0x1\n" + "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End + + "30:" // Oddments: Unroll 2: After oddment store + "add x11, x11, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "ldr q7, [SP, #0x10]\n" + "ldr q2, [SP, #0x30]\n" + "ldr q28, [SP, #0x50]\n" + "ldr q24, [SP, #0x70]\n" + "ldr q30, [%x[params], #0x0]\n" + "mov v22.16b, v30.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "mov v20.16b, v30.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "mov v19.16b, v30.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "ldr q23, [%x[params], #0x40]\n" + ".inst 0x4e8797be // sdot v30.4s, v29.16b, v7.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + ".inst 0x4e8297b4 // sdot v20.4s, v29.16b, v2.16b\n" + "ext v7.16b, v7.16b, v7.16b, #0x1\n" + ".inst 0x4e82977e // sdot v30.4s, v27.16b, v2.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + ".inst 0x4e9c9774 // sdot v20.4s, v27.16b, v28.16b\n" + ".inst 0x4e8797b6 // sdot v22.4s, v29.16b, v7.16b\n" + ".inst 0x4e9c973e // sdot v30.4s, v25.16b, v28.16b\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + ".inst 0x4e989734 // sdot v20.4s, v25.16b, v24.16b\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x4e8297b3 // sdot v19.4s, v29.16b, v2.16b\n" + ".inst 0x4e829776 // sdot v22.4s, v27.16b, v2.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x4e9c9773 // sdot v19.4s, v27.16b, v28.16b\n" + ".inst 0x4e9c9736 // sdot v22.4s, v25.16b, v28.16b\n" + "and v16.16b, v30.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x4e989733 // sdot v19.4s, v25.16b, v24.16b\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "and v18.16b, 
v20.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v17.16b, v22.16b, v21.16b\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v18.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "add v30.4s, v30.4s, v10.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v30.4s, v30.4s, v12.4s\n" + "add v20.4s, v20.4s, v10.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "smin v30.4s, v30.4s, v11.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "31:" // Oddments: Unroll 3: Oddment store + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x21, x21, x11\n" + "add x20, x20, x11\n" + "tbz x19, #1, 32f\n" + "st1 { v30.h }[0], [x24], #0x2\n" + "st1 { v22.h }[0], [x23], #0x2\n" + "st1 { v20.h }[0], [x21], #0x2\n" + "st1 { v19.h }[0], [x20], #0x2\n" + "tbz x19, #0, 33f\n" + "st1 { v30.b }[2], [x24], #0x1\n" + "st1 { v22.b }[2], [x23], #0x1\n" + "st1 { v20.b }[2], [x21], #0x1\n" + "st1 { v19.b }[2], [x20], #0x1\n" + "b 33f\n" + "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset + "tbz x19, #0, 33f\n" + "st1 { v30.b }[0], [x24], #0x1\n" + "st1 { v22.b }[0], [x23], #0x1\n" + "st1 { v20.b }[0], [x21], #0x1\n" + "st1 { v19.b }[0], [x20], #0x1\n" + "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End + + "34:" // End + "add SP, SP, #0x80\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..05eddd1853 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + +struct a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst +{ + typedef uint32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_dot::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_dot::get_packed_size; + + kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + + a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..22c584f8e7 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -0,0 +1,1318 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(__aarch64__) + +#include "arm_gemm.hpp" +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +{ + __asm__ __volatile__( + "ldp x13, x12, [%x[inptrs], #0x0]\n" + "add SP, SP, #-0x80\n" + "ldp x11, x10, [%x[inptrs], #0x10]\n" + "mov x19, #0x1\n" + "ldp x9, x28, [%x[inptrs], #0x20]\n" + "orr x19, x19, #0x100\n" + "ldp x27, x26, [%x[inptrs], #0x30]\n" + "orr x19, x19, #0x10000\n" + "dup v11.4s, w19\n" + "ldp x25, x24, [%x[outptrs], #0x0]\n" + "mov x23, #0x0\n" + "ldp x22, x21, [%x[outptrs], #0x10]\n" + "lsr x20, %x[n_channels], #0x4\n" + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v9.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n" + "ld1r { v12.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "ld1r { v14.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v13.4s }, [x19]\n" + "cbz x20, 2f\n" + "1:" // Loop + "movi v15.4s, #0x0\n" + "ldr q27, [x13, x23]\n" + "subs x20, x20, #0x1\n" + "movi v10.4s, #0x0\n" + "ldr q1, [x12, x23]\n" + "ldp x13, x12, [%x[inptrs], #0x40]\n" + "ldr q25, [x11, x23]\n" + "zip1 v7.16b, v27.16b, v25.16b\n" + "ldr q23, [x10, x23]\n" + "zip2 v5.16b, v27.16b, v25.16b\n" + "ldp x11, x10, [%x[inptrs], #0x50]\n" + "ldr q31, [x9, x23]\n" + "zip1 v8.16b, v1.16b, v23.16b\n" + "ldr q28, [x28, x23]\n" + "zip2 v3.16b, v1.16b, v23.16b\n" + "ldp x9, x28, [%x[inptrs], #0x60]\n" + "zip1 v6.16b, v7.16b, v8.16b\n" + "ldr q21, [x27, x23]\n" + "zip2 v8.16b, v7.16b, v8.16b\n" + "ldr q26, [x26, x23]\n" + "zip1 v7.16b, v5.16b, v3.16b\n" + "ldp x27, x26, [%x[inptrs], #0x70]\n" + "zip2 v5.16b, v5.16b, v3.16b\n" + "ldr q24, [x13, x23]\n" + "ldr q22, [x12, x23]\n" + "zip1 v2.16b, v31.16b, v21.16b\n" + "zip2 v4.16b, v31.16b, v21.16b\n" + "ldp x13, x12, [%x[inptrs], #0x0]\n" + "zip1 v1.16b, v28.16b, v26.16b\n" + "ldr q20, [x11, x23]\n" + "zip2 v31.16b, v28.16b, v26.16b\n" + "ldr q16, [x10, x23]\n" + "zip1 v3.16b, v2.16b, v1.16b\n" + "ldp x11, x10, [%x[inptrs], #0x10]\n" + "zip2 v2.16b, v2.16b, v1.16b\n" + "ldr q19, [x9, x23]\n" + "zip1 v1.16b, v4.16b, v31.16b\n" + "ldr q0, [x28, x23]\n" + "zip1 v28.16b, v24.16b, v20.16b\n" + "ldp x9, x28, [%x[inptrs], #0x20]\n" + "zip2 
v26.16b, v24.16b, v20.16b\n" + "ldr q18, [x27, x23]\n" + "zip1 v24.16b, v22.16b, v16.16b\n" + "ldr q17, [x26, x23]\n" + "zip2 v22.16b, v22.16b, v16.16b\n" + "ldp x27, x26, [%x[inptrs], #0x30]\n" + "zip2 v16.16b, v4.16b, v31.16b\n" + "str q7, [SP, #0x0]\n" + "zip1 v31.16b, v28.16b, v24.16b\n" + "str q5, [SP, #0x10]\n" + "zip1 v20.16b, v19.16b, v18.16b\n" + "str q1, [SP, #0x20]\n" + "zip2 v19.16b, v19.16b, v18.16b\n" + "str q16, [SP, #0x30]\n" + "zip1 v18.16b, v0.16b, v17.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "zip2 v17.16b, v0.16b, v17.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "zip2 v28.16b, v28.16b, v24.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "zip1 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x40]\n" + "zip2 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x50]\n" + "zip1 v26.16b, v20.16b, v18.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "zip2 v24.16b, v20.16b, v18.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x60]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x70]\n" + "mov v22.16b, v30.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "mov v20.16b, v30.16b\n" + "mov v19.16b, v30.16b\n" + ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n" + ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n" + ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n" + ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n" + ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n" + ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n" + ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n" + "ldr q29, [%x[params], #0x70]\n" + ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n" + "ldr q3, [SP, #0x20]\n" + ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n" + "ldr q27, [%x[params], #0x80]\n" + ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n" + "ldr q31, [SP, #0x40]\n" + ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n" + "ldr q25, [%x[params], #0x90]\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n" + "ldr q6, [SP, #0x0]\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + "ldr q26, [SP, #0x60]\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "movi v15.4s, #0x0\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0xa0]\n" + ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin 
v30.4s, v30.4s, v12.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "mov v17.16b, v15.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "ldr q30, [%x[params], #0x60]\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0xb0]\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + "add v19.4s, v19.4s, v13.4s\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x22, x23]\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "mov v19.16b, v30.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n" + ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + "movi v10.4s, #0x0\n" + ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n" + ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + "mls v30.4s, v15.4s, v14.4s\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n" + ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n" + "ldr q29, [%x[params], #0xd0]\n" + ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n" + "ldr q2, [SP, #0x30]\n" + ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n" + "ldr q27, [%x[params], #0xe0]\n" + ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n" + "ldr q28, [SP, #0x50]\n" + ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n" + "ldr q25, [%x[params], #0xf0]\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n" + "ldr q8, [SP, #0x10]\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + "ldr q24, [SP, #0x70]\n" + "and v18.16b, v30.16b, v21.16b\n" + "mls v19.4s, v17.4s, v14.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "movi v15.4s, #0x0\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n" + "movi v10.4s, #0x0\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0x100]\n" + ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v30.4s, v30.4s, 
v13.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "mov v17.16b, v15.16b\n" + "smax v30.4s, v30.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0x110]\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "ldr q30, [%x[params], #0xc0]\n" + "add v19.4s, v19.4s, v13.4s\n" + "str s20, [x22, x23]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "smax v19.4s, v19.4s, v9.4s\n" + ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "mov v19.16b, v30.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n" + ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n" + ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + "mls v30.4s, v15.4s, v14.4s\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n" + ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n" + "ldr q29, [%x[params], #0x130]\n" + ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n" + ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n" + "ldr q27, [%x[params], #0x140]\n" + ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n" + ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n" + "ldr q25, [%x[params], #0x150]\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v16.16b, v20.16b, v21.16b\n" + "movi v15.4s, #0x0\n" + "mls v19.4s, v17.4s, v14.4s\n" + ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "movi v10.4s, #0x0\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "ldr q23, [%x[params], #0x160]\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v30.4s, v30.4s, v13.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "mov v17.16b, v15.16b\n" + "smax v30.4s, v30.4s, v9.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "sqadd 
v19.4s, v19.4s, v16.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v22.4s, v22.4s, v9.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "ldr q21, [%x[params], #0x170]\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "ldr q30, [%x[params], #0x120]\n" + "add %x[params], %x[params], #0x180\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x22, x23]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "smin v19.4s, v19.4s, v12.4s\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "mov v19.16b, v30.16b\n" + "add x23, x23, #0x4\n" + ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n" + ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n" + ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + "mls v30.4s, v15.4s, v14.4s\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n" + ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n" + ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n" + ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n" + ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n" + ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "and v16.16b, v20.16b, v21.16b\n" + "mls v19.4s, v17.4s, v14.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v22.4s, v22.4s, v9.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x25, x23]\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x24, x23]\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 
v20.16b, v20.16b, v20.16b\n" + "str s20, [x22, x23]\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x21, x23]\n" + "add x23, x23, #0x4\n" + "bgt 1b\n" + "tst %x[n_channels], #0xf\n" + "beq 34f\n" + "2:" // Oddments + "and x19, %x[n_channels], #0xf\n" + "add x13, x13, x23\n" + "add x12, x12, x23\n" + "add x11, x11, x23\n" + "add x10, x10, x23\n" + "add x9, x9, x23\n" + "add x28, x28, x23\n" + "add x27, x27, x23\n" + "add x26, x26, x23\n" + "tbz %x[n_channels], #3, 6f\n" + "ld1 { v27.d }[0], [x13], #0x8\n" + "ld1 { v1.d }[0], [x12], #0x8\n" + "ld1 { v25.d }[0], [x11], #0x8\n" + "ld1 { v23.d }[0], [x10], #0x8\n" + "ld1 { v31.d }[0], [x9], #0x8\n" + "ld1 { v28.d }[0], [x28], #0x8\n" + "ld1 { v21.d }[0], [x27], #0x8\n" + "ld1 { v26.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #2, 4f\n" + "ld1 { v27.s }[2], [x13], #0x4\n" + "ld1 { v1.s }[2], [x12], #0x4\n" + "ld1 { v25.s }[2], [x11], #0x4\n" + "ld1 { v23.s }[2], [x10], #0x4\n" + "ld1 { v31.s }[2], [x9], #0x4\n" + "ld1 { v28.s }[2], [x28], #0x4\n" + "ld1 { v21.s }[2], [x27], #0x4\n" + "ld1 { v26.s }[2], [x26], #0x4\n" + "tbz %x[n_channels], #1, 3f\n" + "ld1 { v27.h }[6], [x13], #0x2\n" + "ld1 { v1.h }[6], [x12], #0x2\n" + "ld1 { v25.h }[6], [x11], #0x2\n" + "ld1 { v23.h }[6], [x10], #0x2\n" + "ld1 { v31.h }[6], [x9], #0x2\n" + "ld1 { v28.h }[6], [x28], #0x2\n" + "ld1 { v21.h }[6], [x27], #0x2\n" + "ld1 { v26.h }[6], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[14], [x13], #0x1\n" + "ld1 { v1.b }[14], [x12], #0x1\n" + "ld1 { v25.b }[14], [x11], #0x1\n" + "ld1 { v23.b }[14], [x10], #0x1\n" + "ld1 { v31.b }[14], [x9], #0x1\n" + "ld1 { v28.b }[14], [x28], #0x1\n" + "ld1 { v21.b }[14], [x27], #0x1\n" + "ld1 { v26.b }[14], [x26], #0x1\n" + "b 10f\n" + "3:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[12], [x13], #0x1\n" + "ld1 { v1.b }[12], [x12], #0x1\n" + "ld1 { v25.b }[12], [x11], #0x1\n" + "ld1 { v23.b }[12], [x10], #0x1\n" + "ld1 { v31.b }[12], [x9], #0x1\n" + "ld1 { v28.b }[12], [x28], #0x1\n" + "ld1 { v21.b }[12], [x27], #0x1\n" + "ld1 { v26.b }[12], [x26], #0x1\n" + "b 10f\n" + "4:" // Oddments: Load (A): Bit 3: Bit 2: Unset + "tbz %x[n_channels], #1, 5f\n" + "ld1 { v27.h }[4], [x13], #0x2\n" + "ld1 { v1.h }[4], [x12], #0x2\n" + "ld1 { v25.h }[4], [x11], #0x2\n" + "ld1 { v23.h }[4], [x10], #0x2\n" + "ld1 { v31.h }[4], [x9], #0x2\n" + "ld1 { v28.h }[4], [x28], #0x2\n" + "ld1 { v21.h }[4], [x27], #0x2\n" + "ld1 { v26.h }[4], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[10], [x13], #0x1\n" + "ld1 { v1.b }[10], [x12], #0x1\n" + "ld1 { v25.b }[10], [x11], #0x1\n" + "ld1 { v23.b }[10], [x10], #0x1\n" + "ld1 { v31.b }[10], [x9], #0x1\n" + "ld1 { v28.b }[10], [x28], #0x1\n" + "ld1 { v21.b }[10], [x27], #0x1\n" + "ld1 { v26.b }[10], [x26], #0x1\n" + "b 10f\n" + "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[8], [x13], #0x1\n" + "ld1 { v1.b }[8], [x12], #0x1\n" + "ld1 { v25.b }[8], [x11], #0x1\n" + "ld1 { v23.b }[8], [x10], #0x1\n" + "ld1 { v31.b }[8], [x9], #0x1\n" + "ld1 { v28.b }[8], [x28], #0x1\n" + "ld1 { v21.b }[8], [x27], #0x1\n" + "ld1 { v26.b }[8], [x26], #0x1\n" + "b 10f\n" + "6:" // Oddments: Load (A): Bit 3: Unset + "tbz %x[n_channels], #2, 8f\n" + "ld1 { v27.s }[0], [x13], #0x4\n" + "ld1 { v1.s }[0], [x12], #0x4\n" + "ld1 { v25.s }[0], [x11], #0x4\n" + "ld1 { v23.s }[0], [x10], #0x4\n" + "ld1 { v31.s 
}[0], [x9], #0x4\n" + "ld1 { v28.s }[0], [x28], #0x4\n" + "ld1 { v21.s }[0], [x27], #0x4\n" + "ld1 { v26.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #1, 7f\n" + "ld1 { v27.h }[2], [x13], #0x2\n" + "ld1 { v1.h }[2], [x12], #0x2\n" + "ld1 { v25.h }[2], [x11], #0x2\n" + "ld1 { v23.h }[2], [x10], #0x2\n" + "ld1 { v31.h }[2], [x9], #0x2\n" + "ld1 { v28.h }[2], [x28], #0x2\n" + "ld1 { v21.h }[2], [x27], #0x2\n" + "ld1 { v26.h }[2], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[6], [x13], #0x1\n" + "ld1 { v1.b }[6], [x12], #0x1\n" + "ld1 { v25.b }[6], [x11], #0x1\n" + "ld1 { v23.b }[6], [x10], #0x1\n" + "ld1 { v31.b }[6], [x9], #0x1\n" + "ld1 { v28.b }[6], [x28], #0x1\n" + "ld1 { v21.b }[6], [x27], #0x1\n" + "ld1 { v26.b }[6], [x26], #0x1\n" + "b 10f\n" + "7:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[4], [x13], #0x1\n" + "ld1 { v1.b }[4], [x12], #0x1\n" + "ld1 { v25.b }[4], [x11], #0x1\n" + "ld1 { v23.b }[4], [x10], #0x1\n" + "ld1 { v31.b }[4], [x9], #0x1\n" + "ld1 { v28.b }[4], [x28], #0x1\n" + "ld1 { v21.b }[4], [x27], #0x1\n" + "ld1 { v26.b }[4], [x26], #0x1\n" + "b 10f\n" + "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset + "tbz %x[n_channels], #1, 9f\n" + "ld1 { v27.h }[0], [x13], #0x2\n" + "ld1 { v1.h }[0], [x12], #0x2\n" + "ld1 { v25.h }[0], [x11], #0x2\n" + "ld1 { v23.h }[0], [x10], #0x2\n" + "ld1 { v31.h }[0], [x9], #0x2\n" + "ld1 { v28.h }[0], [x28], #0x2\n" + "ld1 { v21.h }[0], [x27], #0x2\n" + "ld1 { v26.h }[0], [x26], #0x2\n" + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[2], [x13], #0x1\n" + "ld1 { v1.b }[2], [x12], #0x1\n" + "ld1 { v25.b }[2], [x11], #0x1\n" + "ld1 { v23.b }[2], [x10], #0x1\n" + "ld1 { v31.b }[2], [x9], #0x1\n" + "ld1 { v28.b }[2], [x28], #0x1\n" + "ld1 { v21.b }[2], [x27], #0x1\n" + "ld1 { v26.b }[2], [x26], #0x1\n" + "b 10f\n" + "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 10f\n" + "ld1 { v27.b }[0], [x13], #0x1\n" + "ld1 { v1.b }[0], [x12], #0x1\n" + "ld1 { v25.b }[0], [x11], #0x1\n" + "ld1 { v23.b }[0], [x10], #0x1\n" + "ld1 { v31.b }[0], [x9], #0x1\n" + "ld1 { v28.b }[0], [x28], #0x1\n" + "ld1 { v21.b }[0], [x27], #0x1\n" + "ld1 { v26.b }[0], [x26], #0x1\n" + "10:" // Oddments: Load (A): Bit 3: End + "ldp x13, x12, [%x[inptrs], #0x40]\n" + "add x13, x13, x23\n" + "ldp x11, x10, [%x[inptrs], #0x50]\n" + "ldp x9, x28, [%x[inptrs], #0x60]\n" + "add x12, x12, x23\n" + "ldp x27, x26, [%x[inptrs], #0x70]\n" + "add x11, x11, x23\n" + "add x10, x10, x23\n" + "add x9, x9, x23\n" + "add x28, x28, x23\n" + "add x27, x27, x23\n" + "add x26, x26, x23\n" + "tbz %x[n_channels], #3, 14f\n" + "ld1 { v24.d }[0], [x13], #0x8\n" + "ld1 { v22.d }[0], [x12], #0x8\n" + "ld1 { v20.d }[0], [x11], #0x8\n" + "ld1 { v16.d }[0], [x10], #0x8\n" + "ld1 { v19.d }[0], [x9], #0x8\n" + "ld1 { v0.d }[0], [x28], #0x8\n" + "ld1 { v18.d }[0], [x27], #0x8\n" + "ld1 { v17.d }[0], [x26], #0x8\n" + "tbz %x[n_channels], #2, 12f\n" + "ld1 { v24.s }[2], [x13], #0x4\n" + "ld1 { v22.s }[2], [x12], #0x4\n" + "ld1 { v20.s }[2], [x11], #0x4\n" + "ld1 { v16.s }[2], [x10], #0x4\n" + "ld1 { v19.s }[2], [x9], #0x4\n" + "ld1 { v0.s }[2], [x28], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v17.s }[2], [x26], #0x4\n" + "tbz %x[n_channels], #1, 11f\n" + "ld1 { v24.h }[6], [x13], #0x2\n" + "ld1 { v22.h }[6], [x12], #0x2\n" + "ld1 { v20.h }[6], [x11], #0x2\n" + "ld1 { v16.h }[6], [x10], #0x2\n" + "ld1 { v19.h }[6], [x9], #0x2\n" + "ld1 { v0.h }[6], [x28], #0x2\n" + 
"ld1 { v18.h }[6], [x27], #0x2\n" + "ld1 { v17.h }[6], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[14], [x13], #0x1\n" + "ld1 { v22.b }[14], [x12], #0x1\n" + "ld1 { v20.b }[14], [x11], #0x1\n" + "ld1 { v16.b }[14], [x10], #0x1\n" + "ld1 { v19.b }[14], [x9], #0x1\n" + "ld1 { v0.b }[14], [x28], #0x1\n" + "ld1 { v18.b }[14], [x27], #0x1\n" + "ld1 { v17.b }[14], [x26], #0x1\n" + "b 18f\n" + "11:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[12], [x13], #0x1\n" + "ld1 { v22.b }[12], [x12], #0x1\n" + "ld1 { v20.b }[12], [x11], #0x1\n" + "ld1 { v16.b }[12], [x10], #0x1\n" + "ld1 { v19.b }[12], [x9], #0x1\n" + "ld1 { v0.b }[12], [x28], #0x1\n" + "ld1 { v18.b }[12], [x27], #0x1\n" + "ld1 { v17.b }[12], [x26], #0x1\n" + "b 18f\n" + "12:" // Oddments: Load (B): Bit 3: Bit 2: Unset + "tbz %x[n_channels], #1, 13f\n" + "ld1 { v24.h }[4], [x13], #0x2\n" + "ld1 { v22.h }[4], [x12], #0x2\n" + "ld1 { v20.h }[4], [x11], #0x2\n" + "ld1 { v16.h }[4], [x10], #0x2\n" + "ld1 { v19.h }[4], [x9], #0x2\n" + "ld1 { v0.h }[4], [x28], #0x2\n" + "ld1 { v18.h }[4], [x27], #0x2\n" + "ld1 { v17.h }[4], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[10], [x13], #0x1\n" + "ld1 { v22.b }[10], [x12], #0x1\n" + "ld1 { v20.b }[10], [x11], #0x1\n" + "ld1 { v16.b }[10], [x10], #0x1\n" + "ld1 { v19.b }[10], [x9], #0x1\n" + "ld1 { v0.b }[10], [x28], #0x1\n" + "ld1 { v18.b }[10], [x27], #0x1\n" + "ld1 { v17.b }[10], [x26], #0x1\n" + "b 18f\n" + "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[8], [x13], #0x1\n" + "ld1 { v22.b }[8], [x12], #0x1\n" + "ld1 { v20.b }[8], [x11], #0x1\n" + "ld1 { v16.b }[8], [x10], #0x1\n" + "ld1 { v19.b }[8], [x9], #0x1\n" + "ld1 { v0.b }[8], [x28], #0x1\n" + "ld1 { v18.b }[8], [x27], #0x1\n" + "ld1 { v17.b }[8], [x26], #0x1\n" + "b 18f\n" + "14:" // Oddments: Load (B): Bit 3: Unset + "tbz %x[n_channels], #2, 16f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "ld1 { v22.s }[0], [x12], #0x4\n" + "ld1 { v20.s }[0], [x11], #0x4\n" + "ld1 { v16.s }[0], [x10], #0x4\n" + "ld1 { v19.s }[0], [x9], #0x4\n" + "ld1 { v0.s }[0], [x28], #0x4\n" + "ld1 { v18.s }[0], [x27], #0x4\n" + "ld1 { v17.s }[0], [x26], #0x4\n" + "tbz %x[n_channels], #1, 15f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "ld1 { v22.h }[2], [x12], #0x2\n" + "ld1 { v20.h }[2], [x11], #0x2\n" + "ld1 { v16.h }[2], [x10], #0x2\n" + "ld1 { v19.h }[2], [x9], #0x2\n" + "ld1 { v0.h }[2], [x28], #0x2\n" + "ld1 { v18.h }[2], [x27], #0x2\n" + "ld1 { v17.h }[2], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[6], [x13], #0x1\n" + "ld1 { v22.b }[6], [x12], #0x1\n" + "ld1 { v20.b }[6], [x11], #0x1\n" + "ld1 { v16.b }[6], [x10], #0x1\n" + "ld1 { v19.b }[6], [x9], #0x1\n" + "ld1 { v0.b }[6], [x28], #0x1\n" + "ld1 { v18.b }[6], [x27], #0x1\n" + "ld1 { v17.b }[6], [x26], #0x1\n" + "b 18f\n" + "15:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[4], [x13], #0x1\n" + "ld1 { v22.b }[4], [x12], #0x1\n" + "ld1 { v20.b }[4], [x11], #0x1\n" + "ld1 { v16.b }[4], [x10], #0x1\n" + "ld1 { v19.b }[4], [x9], #0x1\n" + "ld1 { v0.b }[4], [x28], #0x1\n" + "ld1 { v18.b }[4], [x27], #0x1\n" + "ld1 { v17.b }[4], [x26], #0x1\n" + "b 18f\n" + "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset + "tbz %x[n_channels], #1, 17f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "ld1 { v22.h }[0], [x12], #0x2\n" + "ld1 { v20.h }[0], [x11], #0x2\n" + "ld1 { v16.h }[0], 
[x10], #0x2\n" + "ld1 { v19.h }[0], [x9], #0x2\n" + "ld1 { v0.h }[0], [x28], #0x2\n" + "ld1 { v18.h }[0], [x27], #0x2\n" + "ld1 { v17.h }[0], [x26], #0x2\n" + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[2], [x13], #0x1\n" + "ld1 { v22.b }[2], [x12], #0x1\n" + "ld1 { v20.b }[2], [x11], #0x1\n" + "ld1 { v16.b }[2], [x10], #0x1\n" + "ld1 { v19.b }[2], [x9], #0x1\n" + "ld1 { v0.b }[2], [x28], #0x1\n" + "ld1 { v18.b }[2], [x27], #0x1\n" + "ld1 { v17.b }[2], [x26], #0x1\n" + "b 18f\n" + "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset + "tbz %x[n_channels], #0, 18f\n" + "ld1 { v24.b }[0], [x13], #0x1\n" + "ld1 { v22.b }[0], [x12], #0x1\n" + "ld1 { v20.b }[0], [x11], #0x1\n" + "ld1 { v16.b }[0], [x10], #0x1\n" + "ld1 { v19.b }[0], [x9], #0x1\n" + "ld1 { v0.b }[0], [x28], #0x1\n" + "ld1 { v18.b }[0], [x27], #0x1\n" + "ld1 { v17.b }[0], [x26], #0x1\n" + "18:" // Oddments: Load (B): Bit 3: End + "zip1 v7.16b, v27.16b, v25.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "cmp x19, #0x4\n" + "zip2 v5.16b, v27.16b, v25.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "zip1 v8.16b, v1.16b, v23.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "zip2 v3.16b, v1.16b, v23.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "zip1 v2.16b, v31.16b, v21.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "zip2 v4.16b, v31.16b, v21.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "zip1 v1.16b, v28.16b, v26.16b\n" + "zip2 v31.16b, v28.16b, v26.16b\n" + "zip1 v28.16b, v24.16b, v20.16b\n" + "zip2 v26.16b, v24.16b, v20.16b\n" + "zip1 v24.16b, v22.16b, v16.16b\n" + "zip2 v22.16b, v22.16b, v16.16b\n" + "zip1 v20.16b, v19.16b, v18.16b\n" + "zip2 v19.16b, v19.16b, v18.16b\n" + "zip1 v18.16b, v0.16b, v17.16b\n" + "zip2 v17.16b, v0.16b, v17.16b\n" + "zip1 v6.16b, v7.16b, v8.16b\n" + "zip2 v8.16b, v7.16b, v8.16b\n" + "zip1 v7.16b, v5.16b, v3.16b\n" + "str q7, [SP, #0x0]\n" + "zip2 v5.16b, v5.16b, v3.16b\n" + "str q5, [SP, #0x10]\n" + "zip1 v3.16b, v2.16b, v1.16b\n" + "zip2 v2.16b, v2.16b, v1.16b\n" + "zip1 v1.16b, v4.16b, v31.16b\n" + "str q1, [SP, #0x20]\n" + "zip2 v16.16b, v4.16b, v31.16b\n" + "str q16, [SP, #0x30]\n" + "zip1 v31.16b, v28.16b, v24.16b\n" + "zip2 v28.16b, v28.16b, v24.16b\n" + "zip1 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x40]\n" + "zip2 v16.16b, v26.16b, v22.16b\n" + "str q16, [SP, #0x50]\n" + "zip1 v26.16b, v20.16b, v18.16b\n" + "zip2 v24.16b, v20.16b, v18.16b\n" + "zip1 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x60]\n" + "zip2 v16.16b, v19.16b, v17.16b\n" + "str q16, [SP, #0x70]\n" + "mov v22.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "mov v19.16b, v30.16b\n" + ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n" + ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n" + "movi v15.4s, #0x0\n" + ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n" + ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n" + ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n" + ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n" + ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + "mov v17.16b, v15.16b\n" + ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n" + ".inst 0x6e8397b3 // udot 
v19.4s, v29.16b, v3.16b\n" + "movi v10.4s, #0x0\n" + ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n" + ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n" + ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n" + ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n" + ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "add v30.4s, v30.4s, v13.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "smin v30.4s, v30.4s, v12.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smax v20.4s, v20.4s, v9.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v22.4s, v22.4s, v13.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 19f\n" + "str s30, [x25, x23]\n" + "str s22, [x24, x23]\n" + "str s20, [x22, x23]\n" + "str s19, [x21, x23]\n" + "b 22f\n" + "19:" // Oddments: Unroll 0: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 20f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 21f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 21f\n" + "20:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset + "tbz x19, #0, 21f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], #0x1\n" + "st1 { v19.b }[0], [x21], #0x1\n" + "21:" // Oddments: Unroll 0: Oddment store: Bit 1: End + + "22:" // Oddments: Unroll 0: After oddment store + "add x23, x23, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "movi v15.4s, #0x0\n" + "ldr q30, [%x[params], #0x0]\n" + ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "cmp x19, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q27, [%x[params], #0x20]\n" + "ldr q25, [%x[params], #0x30]\n" + "mov v22.16b, v30.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "mov v20.16b, v30.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "mov v19.16b, v30.16b\n" + ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n" + ".inst 0x6e8297b4 // udot v20.4s, 
v29.16b, v2.16b\n" + ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n" + ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n" + ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n" + ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n" + ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n" + ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n" + ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n" + ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v30.4s, v30.4s, v12.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 23f\n" + "str s30, [x25, x23]\n" + "str s22, [x24, x23]\n" + "str s20, [x22, x23]\n" + "str s19, [x21, x23]\n" + "b 26f\n" + "23:" // Oddments: Unroll 1: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 24f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 25f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 25f\n" + "24:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset + "tbz x19, #0, 25f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], #0x1\n" + "st1 { v19.b }[0], [x21], 
#0x1\n" + "25:" // Oddments: Unroll 1: Oddment store: Bit 1: End + + "26:" // Oddments: Unroll 1: After oddment store + "add x23, x23, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "movi v15.4s, #0x0\n" + "ldr q6, [SP, #0x0]\n" + "movi v10.4s, #0x0\n" + "ldr q3, [SP, #0x20]\n" + "cmp x19, #0x4\n" + ".inst 0x6e83956f // udot v15.4s, v11.16b, v3.16b\n" + "ldr q31, [SP, #0x40]\n" + "ldr q26, [SP, #0x60]\n" + ".inst 0x6e9f956f // udot v15.4s, v11.16b, v31.16b\n" + "ldr q30, [%x[params], #0x0]\n" + "ldr q29, [%x[params], #0x10]\n" + "mov v22.16b, v30.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "mov v20.16b, v30.16b\n" + "ldr q25, [%x[params], #0x30]\n" + "mov v19.16b, v30.16b\n" + "ldr q23, [%x[params], #0x40]\n" + ".inst 0x6e8697be // udot v30.4s, v29.16b, v6.16b\n" + "ldr q21, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + ".inst 0x6e8397b4 // udot v20.4s, v29.16b, v3.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x6e86956f // udot v15.4s, v11.16b, v6.16b\n" + ".inst 0x6e83977e // udot v30.4s, v27.16b, v3.16b\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + ".inst 0x6e9f9774 // udot v20.4s, v27.16b, v31.16b\n" + "ext v6.16b, v6.16b, v6.16b, #0x1\n" + ".inst 0x6e9f973e // udot v30.4s, v25.16b, v31.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x6e9a9734 // udot v20.4s, v25.16b, v26.16b\n" + "ext v3.16b, v3.16b, v3.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v31.16b, v31.16b, v31.16b, #0x1\n" + "ext v26.16b, v26.16b, v26.16b, #0x1\n" + ".inst 0x6e8697b6 // udot v22.4s, v29.16b, v6.16b\n" + ".inst 0x6e8397b3 // udot v19.4s, v29.16b, v3.16b\n" + ".inst 0x6e83956a // udot v10.4s, v11.16b, v3.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e839776 // udot v22.4s, v27.16b, v3.16b\n" + ".inst 0x6e9f9773 // udot v19.4s, v27.16b, v31.16b\n" + ".inst 0x6e9f956a // udot v10.4s, v11.16b, v31.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x6e9f9736 // udot v22.4s, v25.16b, v31.16b\n" + ".inst 0x6e9a9733 // udot v19.4s, v25.16b, v26.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e86956a // udot v10.4s, v11.16b, v6.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e9a9571 // udot v17.4s, v11.16b, v26.16b\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v30.4s, v30.4s, v12.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "blt 
27f\n" + "str s30, [x25, x23]\n" + "str s22, [x24, x23]\n" + "str s20, [x22, x23]\n" + "str s19, [x21, x23]\n" + "b 30f\n" + "27:" // Oddments: Unroll 2: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 28f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 29f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 29f\n" + "28:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset + "tbz x19, #0, 29f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], #0x1\n" + "st1 { v19.b }[0], [x21], #0x1\n" + "29:" // Oddments: Unroll 2: Oddment store: Bit 1: End + + "30:" // Oddments: Unroll 2: After oddment store + "add x23, x23, #0x4\n" + "subs x19, x19, #0x4\n" + "ble 34f\n" + "movi v15.4s, #0x0\n" + "ldr q8, [SP, #0x10]\n" + "movi v10.4s, #0x0\n" + "ldr q2, [SP, #0x30]\n" + "ldr q28, [SP, #0x50]\n" + ".inst 0x6e82956f // udot v15.4s, v11.16b, v2.16b\n" + "ldr q24, [SP, #0x70]\n" + "ldr q30, [%x[params], #0x0]\n" + "mov v22.16b, v30.16b\n" + "ldr q29, [%x[params], #0x10]\n" + "mov v20.16b, v30.16b\n" + "ldr q27, [%x[params], #0x20]\n" + "mov v19.16b, v30.16b\n" + "ldr q25, [%x[params], #0x30]\n" + ".inst 0x6e9c956f // udot v15.4s, v11.16b, v28.16b\n" + "ldr q23, [%x[params], #0x40]\n" + "ldr q21, [%x[params], #0x50]\n" + ".inst 0x6e8897be // udot v30.4s, v29.16b, v8.16b\n" + "add %x[params], %x[params], #0x60\n" + ".inst 0x6e8297b4 // udot v20.4s, v29.16b, v2.16b\n" + "mov v17.16b, v15.16b\n" + ".inst 0x6e88956f // udot v15.4s, v11.16b, v8.16b\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + ".inst 0x6e82977e // udot v30.4s, v27.16b, v2.16b\n" + ".inst 0x6e9c9774 // udot v20.4s, v27.16b, v28.16b\n" + "ext v8.16b, v8.16b, v8.16b, #0x1\n" + "ext v2.16b, v2.16b, v2.16b, #0x1\n" + ".inst 0x6e9c973e // udot v30.4s, v25.16b, v28.16b\n" + "mls v30.4s, v15.4s, v14.4s\n" + ".inst 0x6e989734 // udot v20.4s, v25.16b, v24.16b\n" + "ext v28.16b, v28.16b, v28.16b, #0x1\n" + "mls v20.4s, v17.4s, v14.4s\n" + "ext v24.16b, v24.16b, v24.16b, #0x1\n" + ".inst 0x6e8897b6 // udot v22.4s, v29.16b, v8.16b\n" + ".inst 0x6e8297b3 // udot v19.4s, v29.16b, v2.16b\n" + ".inst 0x6e82956a // udot v10.4s, v11.16b, v2.16b\n" + "sqrdmulh v30.4s, v30.4s, v23.4s\n" + ".inst 0x6e829776 // udot v22.4s, v27.16b, v2.16b\n" + ".inst 0x6e9c9773 // udot v19.4s, v27.16b, v28.16b\n" + ".inst 0x6e9c956a // udot v10.4s, v11.16b, v28.16b\n" + "and v18.16b, v30.16b, v21.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + ".inst 0x6e9c9736 // udot v22.4s, v25.16b, v28.16b\n" + ".inst 0x6e989733 // udot v19.4s, v25.16b, v24.16b\n" + "mov v17.16b, v10.16b\n" + ".inst 0x6e88956a // udot v10.4s, v11.16b, v8.16b\n" + "mls v22.4s, v10.4s, v14.4s\n" + ".inst 0x6e989571 // udot v17.4s, v11.16b, v24.16b\n" + "sqadd v30.4s, v30.4s, v18.4s\n" + "mls v19.4s, v17.4s, v14.4s\n" + "srshl v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqrdmulh v22.4s, v22.4s, v23.4s\n" + "add v30.4s, v30.4s, v13.4s\n" + "and v16.16b, v20.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v30.4s, v30.4s, v9.4s\n" + "and v17.16b, v22.16b, v21.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v30.4s, v30.4s, v12.4s\n" + "sqrdmulh v19.4s, v19.4s, v23.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" 
+ "and v16.16b, v19.16b, v21.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "srshl v20.4s, v20.4s, v21.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "srshl v22.4s, v22.4s, v21.4s\n" + "add v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v9.4s\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v21.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v22.4s, v22.4s, v9.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v22.4s, v22.4s, v12.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smax v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "smin v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "31:" // Oddments: Unroll 3: Oddment store + "add x25, x25, x23\n" + "add x24, x24, x23\n" + "add x22, x22, x23\n" + "add x21, x21, x23\n" + "tbz x19, #1, 32f\n" + "st1 { v30.h }[0], [x25], #0x2\n" + "st1 { v22.h }[0], [x24], #0x2\n" + "st1 { v20.h }[0], [x22], #0x2\n" + "st1 { v19.h }[0], [x21], #0x2\n" + "tbz x19, #0, 33f\n" + "st1 { v30.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v20.b }[2], [x22], #0x1\n" + "st1 { v19.b }[2], [x21], #0x1\n" + "b 33f\n" + "32:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset + "tbz x19, #0, 33f\n" + "st1 { v30.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v20.b }[0], [x22], #0x1\n" + "st1 { v19.b }[0], [x21], #0x1\n" + "33:" // Oddments: Unroll 3: Oddment store: Bit 1: End + + "34:" // End + "add SP, SP, #0x80\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..09ba75f685 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size; + + kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + + a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..14e113b776 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1192 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const uint8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[16]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[5]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[3]; + inptrs[3] = inptrs_raw[6]; + inptrs[4] = inptrs_raw[9]; + inptrs[5] = inptrs_raw[12]; + inptrs[6] = inptrs_raw[15]; + inptrs[7] = inptrs_raw[1]; + inptrs[8] = inptrs_raw[2]; + inptrs[9] = inptrs_raw[10]; + inptrs[10] = inptrs_raw[4]; + inptrs[11] = inptrs_raw[7]; + inptrs[12] = inptrs_raw[8]; + inptrs[13] = inptrs_raw[11]; + inptrs[14] = inptrs_raw[13]; + inptrs[15] = inptrs_raw[14]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x17, #0x0\n" + "ldr x16, [%x[params], %[offsetof_Params_weights]]\n" + "mov x15, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x14, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x12, x8, #0x3\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r {
v14.16b }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v9.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v15.4s }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v24.4s }, [x20]\n" + "ld1r { v12.4s }, [x19]\n" + "ldp x10, x9, [x21, #0x0]\n" + "ldp x28, x27, [x21, #0x10]\n" + "cbz x12, 3f\n" + "subs x12, x12, #0x1\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q13, [x19, #0x0]\n" + "mov v17.16b, v13.16b\n" + "ldr q19, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v13.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v23.16b, v13.16b\n" + "ldr d0, [x16, #0x0]\n" + "usubl v0.8h, v0.8b, v9.8b\n" + "mov v25.16b, v19.16b\n" + "ldr d1, [x16, #0x8]\n" + "mov v21.16b, v19.16b\n" + "ldr d2, [x16, #0x10]\n" + "usubl v1.8h, v1.8b, v9.8b\n" + "mov v20.16b, v19.16b\n" + "ldr d3, [x16, #0x18]\n" + "ldr d4, [x16, #0x20]\n" + "usubl v2.8h, v2.8b, v9.8b\n" + "ldr d5, [x16, #0x28]\n" + "usubl v3.8h, v3.8b, v9.8b\n" + "ldr d6, [x16, #0x30]\n" + "ldr d7, [x16, #0x38]\n" + "usubl v4.8h, v4.8b, v9.8b\n" + "ldr d8, [x16, #0x40]\n" + "usubl v5.8h, v5.8b, v9.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "usubl v6.8h, v6.8b, v9.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "usubl v7.8h, v7.8b, v9.8b\n" + "usubl v8.8h, v8.8b, v9.8b\n" + "ldr x19, [x14, #0x20]\n" + "ldr d31, [x23, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "ldr d30, [x22, x17]\n" + "ldr d29, [x21, x17]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "ldr d28, [x20, x17]\n" + "ldr d27, [x19, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "usubl v27.8h, v27.8b, v14.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v13.4s, v31.4h, v4.4h\n" + "ldr x21, [x14, #0x28]\n" + "add x16, x16, #0x48\n" + "smlal2 v19.4s, v31.8h, v4.8h\n" + "ldr x20, [x14, #0x30]\n" + "subs x12, x12, #0x1\n" + "smlal v17.4s, v31.4h, v3.4h\n" + "ldr x26, [x14, #0x38]\n" + "smlal2 v25.4s, v31.8h, v3.8h\n" + "ldr x25, [x14, #0x40]\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "ldr x19, [x14, #0x48]\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "ldr x24, [x14, #0x50]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "ldr x23, [x14, #0x58]\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "ldr x22, [x14, #0x60]\n" + "smlal2 v19.4s, v30.8h, v0.8h\n" + "ldr d30, [x19, x17]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "ldr x21, [x14, #0x68]\n" + "smlal2 v25.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "ldr x20, [x14, #0x70]\n" + "smlal2 v19.4s, v28.8h, v5.8h\n" + "ldr x19, [x14, #0x78]\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "ldr q26, [x13, #0x0]\n" + "smlal2 v25.4s, v28.8h, v4.8h\n" + "ldr q10, [x11, #0x0]\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "ldr q11, [x13, #0x10]\n" + "add x13, x13, #0x20\n" + "smlal2 v21.4s, v28.8h, v2.8h\n" + "ldr q18, [x11, #0x10]\n" + "add x11, x11, #0x20\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x17]\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v21.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" + "smlal2 v19.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v27.4h, v6.4h\n" + "smlal2 v25.4s, v27.8h, v6.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "smlal v23.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, 
v27.8h, v3.8h\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v19.4s, v28.8h, v1.8h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v25.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x17]\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" + "smlal2 v19.4s, v31.8h, v2.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v25.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v8.4h\n" + "smlal2 v19.4s, v30.8h, v8.8h\n" + "smlal v17.4s, v30.4h, v7.4h\n" + "smlal2 v25.4s, v30.8h, v7.8h\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal2 v21.4s, v30.8h, v5.8h\n" + "smlal v23.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x17]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "smlal v13.4s, v29.4h, v3.4h\n" + "smlal2 v19.4s, v29.8h, v3.8h\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v5.4h\n" + "smlal2 v25.4s, v28.8h, v5.8h\n" + "smlal v23.4s, v28.4h, v2.4h\n" + "smlal2 v20.4s, v28.8h, v2.8h\n" + "ldr d28, [x19, x17]\n" + "add x17, x17, #0x8\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v19.4s, v31.8h, v6.8h\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal v17.4s, v30.4h, v8.4h\n" + "smlal2 v25.4s, v30.8h, v8.8h\n" + "smlal v23.4s, v30.4h, v5.4h\n" + "smlal2 v20.4s, v30.8h, v5.8h\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal2 v21.4s, v29.8h, v7.8h\n" + "smlal v23.4s, v29.4h, v6.4h\n" + "smlal2 v20.4s, v29.8h, v6.8h\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal2 v21.4s, v28.8h, v8.8h\n" + "smlal v23.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "sqrdmulh v13.4s, v13.4s, v26.4s\n" + "sqrdmulh v19.4s, v19.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v26.4s\n" + "sqrdmulh v25.4s, v25.4s, v11.4s\n" + "and v22.16b, v13.16b, v10.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v28.16b, v19.16b, v18.16b\n" + "and v3.16b, v17.16b, v10.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v6.16b, v25.16b, v18.16b\n" + "sqrdmulh v16.4s, v16.4s, v26.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v11.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqrdmulh v23.4s, v23.4s, v26.4s\n" + "and v0.16b, v16.16b, v10.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v29.16b, v21.16b, v18.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "add v13.4s, v13.4s, v15.4s\n" + "srshl v19.4s, v19.4s, v18.4s\n" + "srshl v17.4s, v17.4s, v10.4s\n" + "srshl v25.4s, v25.4s, v18.4s\n" + "smin v13.4s, v13.4s, v12.4s\n" + "add v19.4s, v19.4s, v15.4s\n" + "add v17.4s, v17.4s, v15.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smin v19.4s, v19.4s, v12.4s\n" + "smin v17.4s, v17.4s, v12.4s\n" + "add v25.4s, v25.4s, v15.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smin v25.4s, v25.4s, v12.4s\n" + "uzp1 v13.16b, v13.16b, v19.16b\n" + "sqadd v16.4s, v16.4s, v0.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "str d13, [x10, x15]\n" + "smax v25.4s, v25.4s, v24.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "srshl v16.4s, v16.4s, v10.4s\n" + "and v3.16b, v23.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v25.16b\n" + "add v16.4s, v16.4s, v15.4s\n" + 
"srshl v21.4s, v21.4s, v18.4s\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x9, x15]\n" + "smin v16.4s, v16.4s, v12.4s\n" + "sqrdmulh v20.4s, v20.4s, v11.4s\n" + "add v21.4s, v21.4s, v15.4s\n" + "sqadd v23.4s, v23.4s, v3.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smin v21.4s, v21.4s, v12.4s\n" + "and v25.16b, v20.16b, v18.16b\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "smax v21.4s, v21.4s, v24.4s\n" + "srshl v23.4s, v23.4s, v10.4s\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "add v23.4s, v23.4s, v15.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x28, x15]\n" + "smin v23.4s, v23.4s, v12.4s\n" + "sqadd v20.4s, v20.4s, v25.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" + "srshl v20.4s, v20.4s, v18.4s\n" + "add v20.4s, v20.4s, v15.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d23, [x27, x15]\n" + "add x15, x15, #0x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q13, [x19, #0x0]\n" + "mov v17.16b, v13.16b\n" + "ldr q19, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v13.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v23.16b, v13.16b\n" + "ldr d0, [x16, #0x0]\n" + "usubl v0.8h, v0.8b, v9.8b\n" + "mov v25.16b, v19.16b\n" + "ldr d1, [x16, #0x8]\n" + "mov v21.16b, v19.16b\n" + "ldr d2, [x16, #0x10]\n" + "usubl v1.8h, v1.8b, v9.8b\n" + "mov v20.16b, v19.16b\n" + "ldr d3, [x16, #0x18]\n" + "ldr d4, [x16, #0x20]\n" + "usubl v2.8h, v2.8b, v9.8b\n" + "ldr d5, [x16, #0x28]\n" + "usubl v3.8h, v3.8b, v9.8b\n" + "ldr d6, [x16, #0x30]\n" + "ldr d7, [x16, #0x38]\n" + "usubl v4.8h, v4.8b, v9.8b\n" + "ldr d8, [x16, #0x40]\n" + "usubl v5.8h, v5.8b, v9.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "usubl v6.8h, v6.8b, v9.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "usubl v7.8h, v7.8b, v9.8b\n" + "usubl v8.8h, v8.8b, v9.8b\n" + "ldr x19, [x14, #0x20]\n" + "ldr d31, [x23, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "ldr d30, [x22, x17]\n" + "ldr d29, [x21, x17]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "ldr d28, [x20, x17]\n" + "ldr d27, [x19, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "usubl v27.8h, v27.8b, v14.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v13.4s, v31.4h, v4.4h\n" + "ldr x21, [x14, #0x28]\n" + "tst x8, #0x7\n" + "smlal2 v19.4s, v31.8h, v4.8h\n" + "ldr x20, [x14, #0x30]\n" + "smlal v17.4s, v31.4h, v3.4h\n" + "ldr x26, [x14, #0x38]\n" + "smlal2 v25.4s, v31.8h, v3.8h\n" + "ldr x25, [x14, #0x40]\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "ldr x19, [x14, #0x48]\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "ldr x24, [x14, #0x50]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "ldr x23, [x14, #0x58]\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "ldr x22, [x14, #0x60]\n" + "smlal2 v19.4s, v30.8h, v0.8h\n" + "ldr d30, [x19, x17]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "ldr x21, [x14, #0x68]\n" + "smlal2 v25.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "ldr x20, [x14, #0x70]\n" + "smlal2 v19.4s, v28.8h, v5.8h\n" + "ldr x19, [x14, #0x78]\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "ldr q26, [x13, #0x0]\n" + "smlal2 v25.4s, v28.8h, v4.8h\n" + "ldr q10, [x11, #0x0]\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "ldr q11, [x13, #0x10]\n" + "add x13, x13, #0x20\n" + "smlal2 v21.4s, v28.8h, v2.8h\n" + "ldr q18, [x11, #0x10]\n" + "add x11, x11, #0x20\n" + "smlal v23.4s, 
v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x17]\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v21.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" + "smlal2 v19.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v27.4h, v6.4h\n" + "smlal2 v25.4s, v27.8h, v6.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "smlal v23.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v19.4s, v28.8h, v1.8h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v25.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x17]\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" + "smlal2 v19.4s, v31.8h, v2.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v25.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x17]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v8.4h\n" + "smlal2 v19.4s, v30.8h, v8.8h\n" + "smlal v17.4s, v30.4h, v7.4h\n" + "smlal2 v25.4s, v30.8h, v7.8h\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal2 v21.4s, v30.8h, v5.8h\n" + "smlal v23.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x17]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "smlal v13.4s, v29.4h, v3.4h\n" + "smlal2 v19.4s, v29.8h, v3.8h\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x17]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v5.4h\n" + "smlal2 v25.4s, v28.8h, v5.8h\n" + "smlal v23.4s, v28.4h, v2.4h\n" + "smlal2 v20.4s, v28.8h, v2.8h\n" + "ldr d28, [x19, x17]\n" + "add x17, x17, #0x8\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v19.4s, v31.8h, v6.8h\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal v17.4s, v30.4h, v8.4h\n" + "smlal2 v25.4s, v30.8h, v8.8h\n" + "smlal v23.4s, v30.4h, v5.4h\n" + "smlal2 v20.4s, v30.8h, v5.8h\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal2 v21.4s, v29.8h, v7.8h\n" + "smlal v23.4s, v29.4h, v6.4h\n" + "smlal2 v20.4s, v29.8h, v6.8h\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal2 v21.4s, v28.8h, v8.8h\n" + "smlal v23.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "sqrdmulh v13.4s, v13.4s, v26.4s\n" + "sqrdmulh v19.4s, v19.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v26.4s\n" + "sqrdmulh v25.4s, v25.4s, v11.4s\n" + "and v22.16b, v13.16b, v10.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v28.16b, v19.16b, v18.16b\n" + "and v3.16b, v17.16b, v10.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v6.16b, v25.16b, v18.16b\n" + "sqrdmulh v16.4s, v16.4s, v26.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v11.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqrdmulh v23.4s, v23.4s, v26.4s\n" + "and v0.16b, v16.16b, v10.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v29.16b, v21.16b, v18.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "add v13.4s, v13.4s, v15.4s\n" + "srshl v19.4s, v19.4s, v18.4s\n" + "srshl v17.4s, v17.4s, v10.4s\n" + "srshl v25.4s, v25.4s, v18.4s\n" + "smin v13.4s, v13.4s, v12.4s\n" + "add v19.4s, v19.4s, v15.4s\n" + "add v17.4s, v17.4s, v15.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smin v19.4s, v19.4s, v12.4s\n" + "smin v17.4s, v17.4s, v12.4s\n" + "add 
v25.4s, v25.4s, v15.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "smin v25.4s, v25.4s, v12.4s\n" + "uzp1 v13.16b, v13.16b, v19.16b\n" + "sqadd v16.4s, v16.4s, v0.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "str d13, [x10, x15]\n" + "smax v25.4s, v25.4s, v24.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "srshl v16.4s, v16.4s, v10.4s\n" + "and v3.16b, v23.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v25.16b\n" + "add v16.4s, v16.4s, v15.4s\n" + "srshl v21.4s, v21.4s, v18.4s\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x9, x15]\n" + "smin v16.4s, v16.4s, v12.4s\n" + "sqrdmulh v20.4s, v20.4s, v11.4s\n" + "add v21.4s, v21.4s, v15.4s\n" + "sqadd v23.4s, v23.4s, v3.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "smin v21.4s, v21.4s, v12.4s\n" + "and v25.16b, v20.16b, v18.16b\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "smax v21.4s, v21.4s, v24.4s\n" + "srshl v23.4s, v23.4s, v10.4s\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "add v23.4s, v23.4s, v15.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x28, x15]\n" + "smin v23.4s, v23.4s, v12.4s\n" + "sqadd v20.4s, v20.4s, v25.4s\n" + "smax v23.4s, v23.4s, v24.4s\n" + "srshl v20.4s, v20.4s, v18.4s\n" + "add v20.4s, v20.4s, v15.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d23, [x27, x15]\n" + "add x15, x15, #0x8\n" + "beq 64f\n" + "add x16, x16, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x8, #2, 5f\n" + "ld1 { v13.4s }, [x19], #0x10\n" + "tbz x8, #1, 4f\n" + "ld1 { v19.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v19.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v19.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x8, #1, 6f\n" + "ld1 { v13.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v13.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v17.16b, v13.16b\n" + "ldr d0, [x16, #0x0]\n" + "mov v25.16b, v19.16b\n" + "ldr d1, [x16, #0x8]\n" + "mov v16.16b, v13.16b\n" + "ldr d2, [x16, #0x10]\n" + "mov v21.16b, v19.16b\n" + "ldr d3, [x16, #0x18]\n" + "mov v23.16b, v13.16b\n" + "ldr d4, [x16, #0x20]\n" + "usubl v0.8h, v0.8b, v9.8b\n" + "mov v20.16b, v19.16b\n" + "ldr d5, [x16, #0x28]\n" + "usubl v1.8h, v1.8b, v9.8b\n" + "ldr d6, [x16, #0x30]\n" + "usubl v2.8h, v2.8b, v9.8b\n" + "ldr d7, [x16, #0x38]\n" + "usubl v3.8h, v3.8b, v9.8b\n" + "ldr d8, [x16, #0x40]\n" + "usubl v4.8h, v4.8b, v9.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "usubl v5.8h, v5.8b, v9.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "usubl v6.8h, v6.8b, v9.8b\n" + "usubl v7.8h, v7.8b, v9.8b\n" + "ldr x19, [x14, #0x20]\n" + "usubl v8.8h, v8.8b, v9.8b\n" + "add x23, x23, x17\n" + "add x22, x22, x17\n" + "add x21, x21, x17\n" + "add x20, x20, x17\n" + "add x19, x19, x17\n" + "tbz x8, #2, 9f\n" + "ld1 { v31.s }[0], [x23], #0x4\n" + "ld1 { v30.s }[0], [x22], #0x4\n" + "ld1 { v29.s }[0], [x21], #0x4\n" + "ld1 { v28.s }[0], [x20], #0x4\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 8f\n" + "ld1 { v31.h }[2], [x23], #0x2\n" + "ld1 { v30.h }[2], [x22], #0x2\n" + "ld1 { v29.h }[2], [x21], #0x2\n" + "ld1 { v28.h }[2], [x20], #0x2\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[6], [x23]\n" + "ld1 { v30.b }[6], [x22]\n" + "ld1 { 
v29.b }[6], [x21]\n" + "ld1 { v28.b }[6], [x20]\n" + "ld1 { v27.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[4], [x23]\n" + "ld1 { v30.b }[4], [x22]\n" + "ld1 { v29.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x20]\n" + "ld1 { v27.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x8, #1, 10f\n" + "ld1 { v31.h }[0], [x23], #0x2\n" + "ld1 { v30.h }[0], [x22], #0x2\n" + "ld1 { v29.h }[0], [x21], #0x2\n" + "ld1 { v28.h }[0], [x20], #0x2\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[2], [x23]\n" + "ld1 { v30.b }[2], [x22]\n" + "ld1 { v29.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x20]\n" + "ld1 { v27.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[0], [x23]\n" + "ld1 { v30.b }[0], [x22]\n" + "ld1 { v29.b }[0], [x21]\n" + "ld1 { v28.b }[0], [x20]\n" + "ld1 { v27.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ldr x21, [x14, #0x28]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v4.4h\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "smlal2 v19.4s, v31.8h, v4.8h\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v17.4s, v31.4h, v3.4h\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v25.4s, v31.8h, v3.8h\n" + "usubl v27.8h, v27.8b, v14.8b\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "add x21, x21, x17\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "smlal2 v19.4s, v30.8h, v0.8h\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v25.4s, v29.8h, v2.8h\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "smlal2 v19.4s, v28.8h, v5.8h\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal2 v25.4s, v28.8h, v4.8h\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "smlal2 v21.4s, v28.8h, v2.8h\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "tbz x8, #2, 13f\n" + "ld1 { v31.s }[0], [x21], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v31.h }[2], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[6], [x21]\n" + "b 15f\n" + "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[4], [x21]\n" + "b 15f\n" + "13:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x8, #1, 14f\n" + "ld1 { v31.h }[0], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[2], [x21]\n" + "b 15f\n" + "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[0], [x21]\n" + "15:" // Oddments: Load (3, 0): Bit 2: End + "smlal v13.4s, v27.4h, v7.4h\n" + "ldr x20, [x14, #0x30]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v19.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v27.4h, v6.4h\n" + "add x20, x20, x17\n" + "smlal2 v25.4s, v27.8h, v6.8h\n" + "smlal v23.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v21.4s, v31.8h, v6.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "tbz x8, #2, 17f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 19f\n" + "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 19f\n" + "17:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x8, #1, 18f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 19f\n" + "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + 
"tbz x8, #0, 19f\n" + "ld1 { v29.b }[0], [x20]\n" + "19:" // Oddments: Load (3, 3): Bit 2: End + "ldr x26, [x14, #0x38]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "add x26, x26, x17\n" + "tbz x8, #2, 21f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x8, #1, 20f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[6], [x26]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[4], [x26]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 1): Bit 2: Unset + "tbz x8, #1, 22f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[2], [x26]\n" + "b 23f\n" + "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[0], [x26]\n" + "23:" // Oddments: Load (0, 1): Bit 2: End + "ldr x25, [x14, #0x40]\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v19.4s, v28.8h, v1.8h\n" + "add x25, x25, x17\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v25.4s, v28.8h, v0.8h\n" + "tbz x8, #2, 25f\n" + "ld1 { v31.s }[0], [x25], #0x4\n" + "tbz x8, #1, 24f\n" + "ld1 { v31.h }[2], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[6], [x25]\n" + "b 27f\n" + "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[4], [x25]\n" + "b 27f\n" + "25:" // Oddments: Load (0, 2): Bit 2: Unset + "tbz x8, #1, 26f\n" + "ld1 { v31.h }[0], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[2], [x25]\n" + "b 27f\n" + "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[0], [x25]\n" + "27:" // Oddments: Load (0, 2): Bit 2: End + "ldr x19, [x14, #0x48]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" + "smlal2 v19.4s, v31.8h, v2.8h\n" + "add x19, x19, x17\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v25.4s, v31.8h, v1.8h\n" + "tbz x8, #2, 29f\n" + "ld1 { v30.s }[0], [x19], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v30.h }[2], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[6], [x19]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[4], [x19]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x8, #1, 30f\n" + "ld1 { v30.h }[0], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[2], [x19]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[0], [x19]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "ldr x24, [x14, #0x50]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "smlal v13.4s, v30.4h, v8.4h\n" + "smlal2 v19.4s, v30.8h, v8.8h\n" + "add x24, x24, x17\n" + "smlal v17.4s, v30.4h, v7.4h\n" + "smlal2 v25.4s, v30.8h, v7.8h\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal2 v21.4s, v30.8h, v5.8h\n" + "smlal v23.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 35f\n" + "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 35f\n" + "33:" // Oddments: Load (1, 0): Bit 2: Unset + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 35f\n" + "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x24]\n" + "35:" // Oddments: Load (1, 0): 
Bit 2: End + "ldr x23, [x14, #0x58]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v13.4s, v29.4h, v3.4h\n" + "smlal2 v19.4s, v29.8h, v3.8h\n" + "add x23, x23, x17\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "tbz x8, #2, 37f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[6], [x23]\n" + "b 39f\n" + "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[4], [x23]\n" + "b 39f\n" + "37:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x8, #1, 38f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[2], [x23]\n" + "b 39f\n" + "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[0], [x23]\n" + "39:" // Oddments: Load (1, 3): Bit 2: End + "ldr x22, [x14, #0x60]\n" + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v5.4h\n" + "smlal2 v25.4s, v28.8h, v5.8h\n" + "add x22, x22, x17\n" + "smlal v23.4s, v28.4h, v2.4h\n" + "smlal2 v20.4s, v28.8h, v2.8h\n" + "tbz x8, #2, 41f\n" + "ld1 { v31.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + "ld1 { v31.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x8, #1, 42f\n" + "ld1 { v31.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 0): Bit 2: End + "ldr x21, [x14, #0x68]\n" + "usubl v31.8h, v31.8b, v14.8b\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "smlal2 v19.4s, v31.8h, v6.8h\n" + "add x21, x21, x17\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "tbz x8, #2, 45f\n" + "ld1 { v30.s }[0], [x21], #0x4\n" + "tbz x8, #1, 44f\n" + "ld1 { v30.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[6], [x21]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[4], [x21]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x8, #1, 46f\n" + "ld1 { v30.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[2], [x21]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[0], [x21]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "ldr x20, [x14, #0x70]\n" + "usubl v30.8h, v30.8b, v14.8b\n" + "smlal v17.4s, v30.4h, v8.4h\n" + "smlal2 v25.4s, v30.8h, v8.8h\n" + "add x20, x20, x17\n" + "smlal v23.4s, v30.4h, v5.4h\n" + "smlal2 v20.4s, v30.8h, v5.8h\n" + "tbz x8, #2, 49f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x8, #1, 50f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[0], [x20]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "ldr x19, [x14, #0x78]\n" + "usubl v29.8h, v29.8b, v14.8b\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal2 v21.4s, v29.8h, v7.8h\n" + "add x19, 
x19, x17\n" + "smlal v23.4s, v29.4h, v6.4h\n" + "smlal2 v20.4s, v29.8h, v6.8h\n" + "tbz x8, #2, 53f\n" + "ld1 { v28.s }[0], [x19], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v28.h }[2], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[6], [x19]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[4], [x19]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x8, #1, 54f\n" + "ld1 { v28.h }[0], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[2], [x19]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[0], [x19]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "usubl v28.8h, v28.8b, v14.8b\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal2 v21.4s, v28.8h, v8.8h\n" + "smlal v23.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "tbz x8, #2, 57f\n" + "ld1 { v26.4s }, [x13], #0x10\n" + "ld1 { v10.4s }, [x11], #0x10\n" + "tbz x8, #1, 56f\n" + "ld1 { v11.d }[0], [x13], #0x8\n" + "ld1 { v18.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v18.s }[2], [x11]\n" + "b 59f\n" + "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v11.s }[0], [x13]\n" + "ld1 { v18.s }[0], [x11]\n" + "b 59f\n" + "57:" // Oddments: Load requant params: Bit 2: Unset + "tbz x8, #1, 58f\n" + "ld1 { v26.d }[0], [x13], #0x8\n" + "ld1 { v10.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v26.s }[2], [x13]\n" + "ld1 { v10.s }[2], [x11]\n" + "b 59f\n" + "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v26.s }[0], [x13]\n" + "ld1 { v10.s }[0], [x11]\n" + "59:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v13.4s, v13.4s, v26.4s\n" + "add x10, x10, x15\n" + "sqrdmulh v19.4s, v19.4s, v11.4s\n" + "add x9, x9, x15\n" + "sqrdmulh v17.4s, v17.4s, v26.4s\n" + "add x28, x28, x15\n" + "sqrdmulh v25.4s, v25.4s, v11.4s\n" + "add x27, x27, x15\n" + "sqrdmulh v16.4s, v16.4s, v26.4s\n" + "and v22.16b, v13.16b, v10.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v28.16b, v19.16b, v18.16b\n" + "and v3.16b, v17.16b, v10.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v6.16b, v25.16b, v18.16b\n" + "and v0.16b, v16.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v11.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v22.4s\n" + "sqrdmulh v23.4s, v23.4s, v26.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v11.4s\n" + "sqadd v19.4s, v19.4s, v28.4s\n" + "sqadd v17.4s, v17.4s, v3.4s\n" + "srshl v13.4s, v13.4s, v10.4s\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v18.4s\n" + "srshl v17.4s, v17.4s, v10.4s\n" + "add v13.4s, v13.4s, v15.4s\n" + "srshl v25.4s, v25.4s, v18.4s\n" + "add v19.4s, v19.4s, v15.4s\n" + "smin v13.4s, v13.4s, v12.4s\n" + "add v17.4s, v17.4s, v15.4s\n" + "smin v19.4s, v19.4s, v12.4s\n" + "smax v13.4s, v13.4s, v24.4s\n" + "smin v17.4s, v17.4s, v12.4s\n" + "smax v19.4s, v19.4s, v24.4s\n" + "add v25.4s, v25.4s, v15.4s\n" + "smax v17.4s, v17.4s, v24.4s\n" + "uzp1 v13.16b, v13.16b, v19.16b\n" + "smin v25.4s, v25.4s, v12.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "sqadd v16.4s, v16.4s, v0.4s\n" + "smax v25.4s, v25.4s, v24.4s\n" + "and v29.16b, v21.16b, v18.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v25.16b\n" + "srshl v16.4s, v16.4s, v10.4s\n" + "and v3.16b, v23.16b, v10.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "add v16.4s, v16.4s, 
v15.4s\n" + "sqadd v21.4s, v21.4s, v29.4s\n" + "and v25.16b, v20.16b, v18.16b\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "smin v16.4s, v16.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v18.4s\n" + "sqadd v23.4s, v23.4s, v3.4s\n" + "smax v16.4s, v16.4s, v24.4s\n" + "add v21.4s, v21.4s, v15.4s\n" + "srshl v23.4s, v23.4s, v10.4s\n" + "sqadd v20.4s, v20.4s, v25.4s\n" + "smin v21.4s, v21.4s, v12.4s\n" + "add v23.4s, v23.4s, v15.4s\n" + "srshl v20.4s, v20.4s, v18.4s\n" + "smax v21.4s, v21.4s, v24.4s\n" + "smin v23.4s, v23.4s, v12.4s\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "add v20.4s, v20.4s, v15.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "smax v23.4s, v23.4s, v24.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v20.4s, v20.4s, v24.4s\n" + "uzp1 v23.16b, v23.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "tbz x8, #2, 61f\n" + "st1 { v13.s }[0], [x10], #0x4\n" + "st1 { v17.s }[0], [x9], #0x4\n" + "st1 { v16.s }[0], [x28], #0x4\n" + "st1 { v23.s }[0], [x27], #0x4\n" + "tbz x8, #1, 60f\n" + "st1 { v13.h }[2], [x10], #0x2\n" + "st1 { v17.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v23.h }[2], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v13.b }[6], [x10], #0x1\n" + "st1 { v17.b }[6], [x9], #0x1\n" + "st1 { v16.b }[6], [x28], #0x1\n" + "st1 { v23.b }[6], [x27], #0x1\n" + "b 63f\n" + "60:" // Oddments: Bit 2: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v13.b }[4], [x10], #0x1\n" + "st1 { v17.b }[4], [x9], #0x1\n" + "st1 { v16.b }[4], [x28], #0x1\n" + "st1 { v23.b }[4], [x27], #0x1\n" + "b 63f\n" + "61:" // Oddments: Bit 2: Unset + "tbz x8, #1, 62f\n" + "st1 { v13.h }[0], [x10], #0x2\n" + "st1 { v17.h }[0], [x9], #0x2\n" + "st1 { v16.h }[0], [x28], #0x2\n" + "st1 { v23.h }[0], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v13.b }[2], [x10], #0x1\n" + "st1 { v17.b }[2], [x9], #0x1\n" + "st1 { v16.b }[2], [x28], #0x1\n" + "st1 { v23.b }[2], [x27], #0x1\n" + "b 63f\n" + "62:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v13.b }[0], [x10], #0x1\n" + "st1 { v17.b }[0], [x9], #0x1\n" + "st1 { v16.b }[0], [x28], #0x1\n" + "st1 { v23.b }[0], [x27], #0x1\n" + "63:" // Oddments: Bit 2: End + + "64:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // 
defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..44817dbccf --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size; + + kern_type kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + + a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..ccdde41973 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const uint8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[25]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[12]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[1]; + inptrs[3] = inptrs_raw[3]; + inptrs[4] = inptrs_raw[4]; + inptrs[5] = inptrs_raw[5]; + inptrs[6] = inptrs_raw[6]; + inptrs[7] = inptrs_raw[2]; + inptrs[8] = inptrs_raw[8]; + inptrs[9] = inptrs_raw[9]; + inptrs[10] = inptrs_raw[7]; + inptrs[11] = inptrs_raw[15]; + inptrs[12] = inptrs_raw[10]; + inptrs[13] = inptrs_raw[16]; + inptrs[14] = inptrs_raw[11]; + inptrs[15] = inptrs_raw[18]; + inptrs[16] = inptrs_raw[13]; + inptrs[17] = inptrs_raw[19]; + inptrs[18] = inptrs_raw[20]; + inptrs[19] = inptrs_raw[14]; + inptrs[20] = inptrs_raw[21]; + inptrs[21] = inptrs_raw[17]; + inptrs[22] = inptrs_raw[23]; + inptrs[23] = inptrs_raw[22]; + inptrs[24] = inptrs_raw[24]; +
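+ // The assignments above permute the raw input pointers into the order in which the inline assembly below consumes them.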
+ } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x5, #0x0\n" + "ldr x6, [%x[params], %[offsetof_Params_weights]]\n" + "mov x7, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x8, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x16, x4, #0x3\n" + "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v12.16b }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v13.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v11.4s }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v19.4s }, [x20]\n" + "ld1r { v14.4s }, [x19]\n" + "ldp x14, x13, [x21, #0x0]\n" + "ldp x12, x11, [x21, #0x10]\n" + "cbz x16, 3f\n" + "subs x16, x16, #0x1\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "mov v20.16b, v15.16b\n" + "ldr q10, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v15.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v17.16b, v15.16b\n" + "ldr d0, [x6, #0x0]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "mov v23.16b, v10.16b\n" + "ldr d1, [x6, #0x8]\n" + "mov v22.16b, v10.16b\n" + "ldr d2, [x6, #0x10]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "mov v18.16b, v10.16b\n" + "ldr d3, [x6, #0x18]\n" + "ldr d4, [x6, #0x20]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "ldr d5, [x6, #0x28]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "ldr d6, [x6, #0x30]\n" + "ldr d7, [x6, #0x38]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "ldr d8, [x6, #0x40]\n" + "usubl v5.8h, v5.8b, v13.8b\n" + "ldp x26, x25, [x8, #0x0]\n" + "usubl v6.8h, v6.8b, v13.8b\n" + "ldp x24, x23, [x8, #0x10]\n" + "usubl v7.8h, v7.8b, v13.8b\n" + "usubl v8.8h, v8.8b, v13.8b\n" + "ldp x22, x21, [x8, #0x20]\n" + "ldp x20, x19, [x8, #0x30]\n" + "ldr d31, [x26, x5]\n" + "usubl v31.8h, v31.8b, v12.8b\n" + "ldr d30, [x25, x5]\n" + "ldr d29, [x24, x5]\n" + "usubl v30.8h, v30.8b, v12.8b\n" + "ldr d28, [x23, x5]\n" + "ldr d27, [x22, x5]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "ldr d26, [x21, x5]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "ldr d25, [x20, x5]\n" + "ldr d24, [x19, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v15.4s, v31.4h, v8.4h\n" + "ldr x23, [x8, #0x40]\n" + "add x6, x6, #0x48\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "ldr x22, [x8, #0x48]\n" + "subs x16, x16, #0x1\n" + "smlal v20.4s, v31.4h, v6.4h\n" + "ldr x21, [x8, #0x50]\n" + "smlal2 v23.4s, v31.8h, v6.8h\n" + "ldr x20, [x8, #0x58]\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "ldr x19, [x8, #0x60]\n" + "smlal2 v22.4s, v31.8h, v2.8h\n" + "ldr x10, [x8, #0x68]\n" + "smlal v17.4s, v31.4h, v0.4h\n" + "ldr x9, [x8, #0x70]\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr x28, [x8, #0x78]\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "ldr x27, [x8, #0x80]\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x26, [x8, #0x88]\n" + "smlal v20.4s, v28.4h, v1.4h\n" + "ldr x25, [x8, #0x90]\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x5]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "ldr x24, [x8, #0x98]\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" 
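+ // Main loop: each smlal/smlal2 pair accumulates the low and high halves of an offset-corrected (usubl) input vector against one of the nine 3x3 filter taps held in v0-v8, while the pointers and data for the next input rows are loaded.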
+ "ldr d29, [x23, x5]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v2.4h\n" + "ldr x23, [x8, #0xa0]\n" + "smlal2 v23.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "ldr x22, [x8, #0xa8]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "ldr d26, [x20, x5]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "ldr x21, [x8, #0xb0]\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "ldr d25, [x19, x5]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "ldr x20, [x8, #0xb8]\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "ldr x19, [x8, #0xc0]\n" + "smlal v20.4s, v24.4h, v0.4h\n" + "ldr q21, [x17, #0x0]\n" + "smlal2 v23.4s, v24.8h, v0.8h\n" + "ldr d24, [x9, x5]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v4.4h\n" + "ldr q30, [x15, #0x0]\n" + "smlal2 v23.4s, v29.8h, v4.8h\n" + "ldr d29, [x10, x5]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v5.4h\n" + "ldr q31, [x17, #0x10]\n" + "smlal2 v23.4s, v28.8h, v5.8h\n" + "ldr d28, [x27, x5]\n" + "add x17, x17, #0x20\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "ldr q9, [x15, #0x10]\n" + "add x15, x15, #0x20\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "ldr d27, [x28, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v26.4h, v3.4h\n" + "smlal2 v22.4s, v26.8h, v3.8h\n" + "ldr d26, [x26, x5]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v22.4s, v25.8h, v0.8h\n" + "ldr d25, [x25, x5]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v29.4h, v4.4h\n" + "smlal2 v22.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x5]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v22.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x5]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "smlal v17.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "smlal v16.4s, v25.4h, v6.4h\n" + "smlal2 v22.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x5]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v5.4h\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "ldr d26, [x21, x5]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "ldr d29, [x19, x5]\n" + "add x5, x5, #0x8\n" + "smlal v16.4s, v27.4h, v7.4h\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v22.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v16.4s, v24.4h, v5.4h\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "sqrdmulh v15.4s, v15.4s, v21.4s\n" + "smlal2 v22.4s, v24.8h, v5.8h\n" + "smlal v17.4s, v26.4h, v7.4h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "smlal v16.4s, v25.4h, v8.4h\n" + "smlal2 v22.4s, v25.8h, v8.8h\n" + "smlal v17.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "and v26.16b, v15.16b, v30.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "smlal v17.4s, v29.4h, v8.4h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "sqrdmulh v10.4s, v10.4s, v31.4s\n" + "sqrdmulh v20.4s, v20.4s, v21.4s\n" + "sqrdmulh v23.4s, v23.4s, v31.4s\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "sqadd v15.4s, 
v15.4s, v26.4s\n" + "and v8.16b, v10.16b, v9.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v30.4s\n" + "and v4.16b, v20.16b, v30.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v2.16b, v23.16b, v9.16b\n" + "and v1.16b, v16.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "add v15.4s, v15.4s, v11.4s\n" + "sqadd v10.4s, v10.4s, v8.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v31.4s\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "smin v15.4s, v15.4s, v14.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "smax v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v30.4s\n" + "add v10.4s, v10.4s, v11.4s\n" + "srshl v23.4s, v23.4s, v9.4s\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "add v20.4s, v20.4s, v11.4s\n" + "add v23.4s, v23.4s, v11.4s\n" + "smax v10.4s, v10.4s, v19.4s\n" + "smin v20.4s, v20.4s, v14.4s\n" + "smin v23.4s, v23.4s, v14.4s\n" + "uzp1 v15.16b, v15.16b, v10.16b\n" + "smax v20.4s, v20.4s, v19.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x14, x7]\n" + "smax v23.4s, v23.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v30.4s\n" + "and v24.16b, v22.16b, v9.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v23.16b\n" + "add v16.4s, v16.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v21.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str d20, [x13, x7]\n" + "smin v16.4s, v16.4s, v14.4s\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "sqadd v22.4s, v22.4s, v24.4s\n" + "and v2.16b, v17.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "smax v16.4s, v16.4s, v19.4s\n" + "srshl v22.4s, v22.4s, v9.4s\n" + "and v31.16b, v18.16b, v9.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "add v22.4s, v22.4s, v11.4s\n" + "sqadd v17.4s, v17.4s, v2.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v31.4s\n" + "smax v22.4s, v22.4s, v19.4s\n" + "uzp1 v16.16b, v16.16b, v22.16b\n" + "add v17.4s, v17.4s, v11.4s\n" + "srshl v18.4s, v18.4s, v9.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x12, x7]\n" + "smin v17.4s, v17.4s, v14.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "smax v17.4s, v17.4s, v19.4s\n" + "smin v18.4s, v18.4s, v14.4s\n" + "smax v18.4s, v18.4s, v19.4s\n" + "uzp1 v17.16b, v17.16b, v18.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x11, x7]\n" + "add x7, x7, #0x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "mov v20.16b, v15.16b\n" + "ldr q10, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v16.16b, v15.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v17.16b, v15.16b\n" + "ldr d0, [x6, #0x0]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "mov v23.16b, v10.16b\n" + "ldr d1, [x6, #0x8]\n" + "mov v22.16b, v10.16b\n" + "ldr d2, [x6, #0x10]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "mov v18.16b, v10.16b\n" + "ldr d3, [x6, #0x18]\n" + "ldr d4, [x6, #0x20]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "ldr d5, [x6, #0x28]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "ldr d6, [x6, #0x30]\n" + "ldr d7, [x6, #0x38]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "ldr d8, [x6, #0x40]\n" + "usubl v5.8h, v5.8b, v13.8b\n" + "ldp x26, x25, [x8, #0x0]\n" + "usubl v6.8h, v6.8b, v13.8b\n" + "ldp x24, x23, [x8, #0x10]\n" + "usubl v7.8h, v7.8b, v13.8b\n" + "usubl v8.8h, v8.8b, v13.8b\n" + "ldp x22, x21, [x8, #0x20]\n" + "ldp x20, x19, [x8, #0x30]\n" + "ldr d31, [x26, x5]\n" + "usubl v31.8h, v31.8b, v12.8b\n" + "ldr d30, [x25, x5]\n" + "ldr d29, [x24, x5]\n" + "usubl v30.8h, v30.8b, v12.8b\n" + "ldr d28, [x23, x5]\n" + "ldr d27, [x22, x5]\n" + "usubl 
v29.8h, v29.8b, v12.8b\n" + "ldr d26, [x21, x5]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "ldr d25, [x20, x5]\n" + "ldr d24, [x19, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v15.4s, v31.4h, v8.4h\n" + "ldr x23, [x8, #0x40]\n" + "tst x4, #0x7\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "ldr x22, [x8, #0x48]\n" + "smlal v20.4s, v31.4h, v6.4h\n" + "ldr x21, [x8, #0x50]\n" + "smlal2 v23.4s, v31.8h, v6.8h\n" + "ldr x20, [x8, #0x58]\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "ldr x19, [x8, #0x60]\n" + "smlal2 v22.4s, v31.8h, v2.8h\n" + "ldr x10, [x8, #0x68]\n" + "smlal v17.4s, v31.4h, v0.4h\n" + "ldr x9, [x8, #0x70]\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr x28, [x8, #0x78]\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "ldr x27, [x8, #0x80]\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x26, [x8, #0x88]\n" + "smlal v20.4s, v28.4h, v1.4h\n" + "ldr x25, [x8, #0x90]\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x5]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "ldr x24, [x8, #0x98]\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "ldr d29, [x23, x5]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v2.4h\n" + "ldr x23, [x8, #0xa0]\n" + "smlal2 v23.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "ldr x22, [x8, #0xa8]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "ldr d26, [x20, x5]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "ldr x21, [x8, #0xb0]\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "ldr d25, [x19, x5]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "ldr x20, [x8, #0xb8]\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "ldr x19, [x8, #0xc0]\n" + "smlal v20.4s, v24.4h, v0.4h\n" + "ldr q21, [x17, #0x0]\n" + "smlal2 v23.4s, v24.8h, v0.8h\n" + "ldr d24, [x9, x5]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v4.4h\n" + "ldr q30, [x15, #0x0]\n" + "smlal2 v23.4s, v29.8h, v4.8h\n" + "ldr d29, [x10, x5]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v5.4h\n" + "ldr q31, [x17, #0x10]\n" + "smlal2 v23.4s, v28.8h, v5.8h\n" + "ldr d28, [x27, x5]\n" + "add x17, x17, #0x20\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "ldr q9, [x15, #0x10]\n" + "add x15, x15, #0x20\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "ldr d27, [x28, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v26.4h, v3.4h\n" + "smlal2 v22.4s, v26.8h, v3.8h\n" + "ldr d26, [x26, x5]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v22.4s, v25.8h, v0.8h\n" + "ldr d25, [x25, x5]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v29.4h, v4.4h\n" + "smlal2 v22.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x5]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v22.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x5]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x5]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "smlal v17.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "smlal v16.4s, v25.4h, v6.4h\n" + "smlal2 
v22.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x5]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v5.4h\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "ldr d26, [x21, x5]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "ldr d29, [x19, x5]\n" + "add x5, x5, #0x8\n" + "smlal v16.4s, v27.4h, v7.4h\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v22.4s, v27.8h, v7.8h\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v16.4s, v24.4h, v5.4h\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "sqrdmulh v15.4s, v15.4s, v21.4s\n" + "smlal2 v22.4s, v24.8h, v5.8h\n" + "smlal v17.4s, v26.4h, v7.4h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "smlal v16.4s, v25.4h, v8.4h\n" + "smlal2 v22.4s, v25.8h, v8.8h\n" + "smlal v17.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "and v26.16b, v15.16b, v30.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "smlal v17.4s, v29.4h, v8.4h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "sqrdmulh v10.4s, v10.4s, v31.4s\n" + "sqrdmulh v20.4s, v20.4s, v21.4s\n" + "sqrdmulh v23.4s, v23.4s, v31.4s\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "sqadd v15.4s, v15.4s, v26.4s\n" + "and v8.16b, v10.16b, v9.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v30.4s\n" + "and v4.16b, v20.16b, v30.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v2.16b, v23.16b, v9.16b\n" + "and v1.16b, v16.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "add v15.4s, v15.4s, v11.4s\n" + "sqadd v10.4s, v10.4s, v8.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v31.4s\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "smin v15.4s, v15.4s, v14.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "smax v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v30.4s\n" + "add v10.4s, v10.4s, v11.4s\n" + "srshl v23.4s, v23.4s, v9.4s\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "add v20.4s, v20.4s, v11.4s\n" + "add v23.4s, v23.4s, v11.4s\n" + "smax v10.4s, v10.4s, v19.4s\n" + "smin v20.4s, v20.4s, v14.4s\n" + "smin v23.4s, v23.4s, v14.4s\n" + "uzp1 v15.16b, v15.16b, v10.16b\n" + "smax v20.4s, v20.4s, v19.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x14, x7]\n" + "smax v23.4s, v23.4s, v19.4s\n" + "srshl v16.4s, v16.4s, v30.4s\n" + "and v24.16b, v22.16b, v9.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v23.16b\n" + "add v16.4s, v16.4s, v11.4s\n" + "sqrdmulh v17.4s, v17.4s, v21.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str d20, [x13, x7]\n" + "smin v16.4s, v16.4s, v14.4s\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "sqadd v22.4s, v22.4s, v24.4s\n" + "and v2.16b, v17.16b, v30.16b\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "smax v16.4s, v16.4s, v19.4s\n" + "srshl v22.4s, v22.4s, v9.4s\n" + "and v31.16b, v18.16b, v9.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "add v22.4s, v22.4s, v11.4s\n" + "sqadd v17.4s, v17.4s, v2.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v31.4s\n" + "smax v22.4s, v22.4s, v19.4s\n" + "uzp1 v16.16b, v16.16b, v22.16b\n" + "add v17.4s, v17.4s, v11.4s\n" + "srshl v18.4s, v18.4s, v9.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x12, x7]\n" + "smin v17.4s, v17.4s, v14.4s\n" + "add v18.4s, v18.4s, v11.4s\n" + "smax v17.4s, v17.4s, v19.4s\n" + "smin v18.4s, v18.4s, v14.4s\n" + "smax v18.4s, v18.4s, v19.4s\n" + "uzp1 v17.16b, v17.16b, v18.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d17, [x11, x7]\n" + "add x7, x7, #0x8\n" + "beq 88f\n" + "add 
x6, x6, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x4, #2, 5f\n" + "ld1 { v15.4s }, [x19], #0x10\n" + "tbz x4, #1, 4f\n" + "ld1 { v10.d }[0], [x19], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v10.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x4, #1, 6f\n" + "ld1 { v15.d }[0], [x19], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v20.16b, v15.16b\n" + "ldr d0, [x6, #0x0]\n" + "mov v23.16b, v10.16b\n" + "ldr d1, [x6, #0x8]\n" + "mov v16.16b, v15.16b\n" + "ldr d2, [x6, #0x10]\n" + "mov v22.16b, v10.16b\n" + "ldr d3, [x6, #0x18]\n" + "mov v17.16b, v15.16b\n" + "ldr d4, [x6, #0x20]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "mov v18.16b, v10.16b\n" + "ldr d5, [x6, #0x28]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "ldr d6, [x6, #0x30]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "ldr d7, [x6, #0x38]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "ldr d8, [x6, #0x40]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "ldp x26, x25, [x8, #0x0]\n" + "usubl v5.8h, v5.8b, v13.8b\n" + "ldp x24, x23, [x8, #0x10]\n" + "usubl v6.8h, v6.8b, v13.8b\n" + "usubl v7.8h, v7.8b, v13.8b\n" + "ldp x22, x21, [x8, #0x20]\n" + "usubl v8.8h, v8.8b, v13.8b\n" + "ldp x20, x19, [x8, #0x30]\n" + "add x26, x26, x5\n" + "add x25, x25, x5\n" + "add x24, x24, x5\n" + "add x23, x23, x5\n" + "add x22, x22, x5\n" + "add x21, x21, x5\n" + "add x20, x20, x5\n" + "add x19, x19, x5\n" + "tbz x4, #2, 9f\n" + "ld1 { v31.s }[0], [x26], #0x4\n" + "ld1 { v30.s }[0], [x25], #0x4\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x4, #1, 8f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v30.h }[2], [x25], #0x2\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "ld1 { v24.h }[2], [x19], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v30.b }[6], [x25]\n" + "ld1 { v29.b }[6], [x24]\n" + "ld1 { v28.b }[6], [x23]\n" + "ld1 { v27.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "ld1 { v24.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v30.b }[4], [x25]\n" + "ld1 { v29.b }[4], [x24]\n" + "ld1 { v28.b }[4], [x23]\n" + "ld1 { v27.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "ld1 { v24.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x4, #1, 10f\n" + "ld1 { v31.h }[0], [x26], #0x2\n" + "ld1 { v30.h }[0], [x25], #0x2\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v30.b }[2], [x25]\n" + "ld1 { v29.b }[2], [x24]\n" + "ld1 { v28.b }[2], [x23]\n" + "ld1 { v27.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "ld1 { v24.b }[2], [x19]\n" + "b 11f\n" + "10:" 
// Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[0], [x26]\n" + "ld1 { v30.b }[0], [x25]\n" + "ld1 { v29.b }[0], [x24]\n" + "ld1 { v28.b }[0], [x23]\n" + "ld1 { v27.b }[0], [x22]\n" + "ld1 { v26.b }[0], [x21]\n" + "ld1 { v25.b }[0], [x20]\n" + "ld1 { v24.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ldr x23, [x8, #0x40]\n" + "usubl v31.8h, v31.8b, v12.8b\n" + "smlal v15.4s, v31.4h, v8.4h\n" + "usubl v30.8h, v30.8b, v12.8b\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v31.4h, v6.4h\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal2 v23.4s, v31.8h, v6.8h\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal2 v22.4s, v31.8h, v2.8h\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v17.4s, v31.4h, v0.4h\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "add x23, x23, x5\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "smlal v20.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "smlal v20.4s, v27.4h, v2.4h\n" + "smlal2 v23.4s, v27.8h, v2.8h\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "smlal v20.4s, v24.4h, v0.4h\n" + "smlal2 v23.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "tbz x4, #2, 13f\n" + "ld1 { v29.s }[0], [x23], #0x4\n" + "tbz x4, #1, 12f\n" + "ld1 { v29.h }[2], [x23], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[6], [x23]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[4], [x23]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x4, #1, 14f\n" + "ld1 { v29.h }[0], [x23], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[2], [x23]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v29.b }[0], [x23]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "ldr x22, [x8, #0x48]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v4.4h\n" + "smlal2 v23.4s, v29.8h, v4.8h\n" + "add x22, x22, x5\n" + "tbz x4, #2, 17f\n" + "ld1 { v28.s }[0], [x22], #0x4\n" + "tbz x4, #1, 16f\n" + "ld1 { v28.h }[2], [x22], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[6], [x22]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[4], [x22]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x4, #1, 18f\n" + "ld1 { v28.h }[0], [x22], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[2], [x22]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v28.b }[0], [x22]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "ldr x21, [x8, #0x50]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v5.4h\n" + "smlal2 v23.4s, v28.8h, v5.8h\n" + "add x21, x21, x5\n" + "tbz x4, #2, 21f\n" + "ld1 { v27.s }[0], [x21], #0x4\n" + "tbz x4, #1, 20f\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[6], [x21]\n" + "b 23f\n" + "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[4], [x21]\n" + "b 23f\n" + "21:" // Oddments: Load (1, 2): Bit 2: Unset + "tbz x4, #1, 22f\n" + "ld1 { v27.h }[0], [x21], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[2], [x21]\n" + "b 23f\n" + "22:" 
// Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[0], [x21]\n" + "23:" // Oddments: Load (1, 2): Bit 2: End + "ldr x20, [x8, #0x58]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "add x20, x20, x5\n" + "smlal v20.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "tbz x4, #2, 25f\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "tbz x4, #1, 24f\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[6], [x20]\n" + "b 27f\n" + "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[4], [x20]\n" + "b 27f\n" + "25:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x4, #1, 26f\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[2], [x20]\n" + "b 27f\n" + "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v26.b }[0], [x20]\n" + "27:" // Oddments: Load (3, 0): Bit 2: End + "ldr x19, [x8, #0x60]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v16.4s, v26.4h, v3.4h\n" + "smlal2 v22.4s, v26.8h, v3.8h\n" + "add x19, x19, x5\n" + "tbz x4, #2, 29f\n" + "ld1 { v25.s }[0], [x19], #0x4\n" + "tbz x4, #1, 28f\n" + "ld1 { v25.h }[2], [x19], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[6], [x19]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[4], [x19]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x4, #1, 30f\n" + "ld1 { v25.h }[0], [x19], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[2], [x19]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v25.b }[0], [x19]\n" + "31:" // Oddments: Load (2, 0): Bit 2: End + "ldr x10, [x8, #0x68]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "add x10, x10, x5\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v22.4s, v25.8h, v0.8h\n" + "tbz x4, #2, 33f\n" + "ld1 { v29.s }[0], [x10], #0x4\n" + "tbz x4, #1, 32f\n" + "ld1 { v29.h }[2], [x10], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[6], [x10]\n" + "b 35f\n" + "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[4], [x10]\n" + "b 35f\n" + "33:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x4, #1, 34f\n" + "ld1 { v29.h }[0], [x10], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[2], [x10]\n" + "b 35f\n" + "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v29.b }[0], [x10]\n" + "35:" // Oddments: Load (3, 1): Bit 2: End + "ldr x9, [x8, #0x70]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v16.4s, v29.4h, v4.4h\n" + "smlal2 v22.4s, v29.8h, v4.8h\n" + "add x9, x9, x5\n" + "tbz x4, #2, 37f\n" + "ld1 { v24.s }[0], [x9], #0x4\n" + "tbz x4, #1, 36f\n" + "ld1 { v24.h }[2], [x9], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[6], [x9]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[4], [x9]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x4, #1, 38f\n" + "ld1 { v24.h }[0], [x9], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[2], [x9]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v24.b }[0], [x9]\n" + "39:" // Oddments: Load (2, 1): Bit 2: End + "ldr x28, [x8, #0x78]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "add x28, x28, x5\n" + 
"smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v22.4s, v24.8h, v1.8h\n" + "tbz x4, #2, 41f\n" + "ld1 { v27.s }[0], [x28], #0x4\n" + "tbz x4, #1, 40f\n" + "ld1 { v27.h }[2], [x28], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[6], [x28]\n" + "b 43f\n" + "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[4], [x28]\n" + "b 43f\n" + "41:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x4, #1, 42f\n" + "ld1 { v27.h }[0], [x28], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[2], [x28]\n" + "b 43f\n" + "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v27.b }[0], [x28]\n" + "43:" // Oddments: Load (3, 3): Bit 2: End + "ldr x27, [x8, #0x80]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "add x27, x27, x5\n" + "tbz x4, #2, 45f\n" + "ld1 { v28.s }[0], [x27], #0x4\n" + "tbz x4, #1, 44f\n" + "ld1 { v28.h }[2], [x27], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[6], [x27]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[4], [x27]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x4, #1, 46f\n" + "ld1 { v28.h }[0], [x27], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[2], [x27]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v28.b }[0], [x27]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "ldr x26, [x8, #0x88]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v20.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "add x26, x26, x5\n" + "smlal v17.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "tbz x4, #2, 49f\n" + "ld1 { v26.s }[0], [x26], #0x4\n" + "tbz x4, #1, 48f\n" + "ld1 { v26.h }[2], [x26], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[6], [x26]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[4], [x26]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x4, #1, 50f\n" + "ld1 { v26.h }[0], [x26], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[2], [x26]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v26.b }[0], [x26]\n" + "51:" // Oddments: Load (3, 4): Bit 2: End + "ldr x25, [x8, #0x90]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v5.4h\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "add x25, x25, x5\n" + "tbz x4, #2, 53f\n" + "ld1 { v25.s }[0], [x25], #0x4\n" + "tbz x4, #1, 52f\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[6], [x25]\n" + "b 55f\n" + "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[4], [x25]\n" + "b 55f\n" + "53:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x4, #1, 54f\n" + "ld1 { v25.h }[0], [x25], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[2], [x25]\n" + "b 55f\n" + "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v25.b }[0], [x25]\n" + "55:" // Oddments: Load (4, 0): Bit 2: End + "ldr x24, [x8, #0x98]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v25.4h, v6.4h\n" + "smlal2 v22.4s, v25.8h, v6.8h\n" + "add x24, x24, x5\n" + "tbz x4, #2, 57f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x4, #1, 56f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 59f\n" + "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 59f\n" + 
"57:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x4, #1, 58f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 59f\n" + "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v29.b }[0], [x24]\n" + "59:" // Oddments: Load (2, 4): Bit 2: End + "ldr x23, [x8, #0xa0]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v20.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "add x23, x23, x5\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "tbz x4, #2, 61f\n" + "ld1 { v27.s }[0], [x23], #0x4\n" + "tbz x4, #1, 60f\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[6], [x23]\n" + "b 63f\n" + "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[4], [x23]\n" + "b 63f\n" + "61:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x4, #1, 62f\n" + "ld1 { v27.h }[0], [x23], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[2], [x23]\n" + "b 63f\n" + "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v27.b }[0], [x23]\n" + "63:" // Oddments: Load (4, 1): Bit 2: End + "ldr x22, [x8, #0xa8]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v16.4s, v27.4h, v7.4h\n" + "smlal2 v22.4s, v27.8h, v7.8h\n" + "add x22, x22, x5\n" + "tbz x4, #2, 65f\n" + "ld1 { v24.s }[0], [x22], #0x4\n" + "tbz x4, #1, 64f\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[6], [x22]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[4], [x22]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x4, #1, 66f\n" + "ld1 { v24.h }[0], [x22], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[2], [x22]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v24.b }[0], [x22]\n" + "67:" // Oddments: Load (3, 2): Bit 2: End + "ldr x21, [x8, #0xb0]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v16.4s, v24.4h, v5.4h\n" + "smlal2 v22.4s, v24.8h, v5.8h\n" + "add x21, x21, x5\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "tbz x4, #2, 69f\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "tbz x4, #1, 68f\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[6], [x21]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[4], [x21]\n" + "b 71f\n" + "69:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x4, #1, 70f\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[2], [x21]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v26.b }[0], [x21]\n" + "71:" // Oddments: Load (4, 3): Bit 2: End + "ldr x20, [x8, #0xb8]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v17.4s, v26.4h, v7.4h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "add x20, x20, x5\n" + "tbz x4, #2, 73f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x4, #1, 72f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x4, #1, 74f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v25.b }[0], [x20]\n" + "75:" // Oddments: 
Load (4, 2): Bit 2: End + "ldr x19, [x8, #0xc0]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v16.4s, v25.4h, v8.4h\n" + "smlal2 v22.4s, v25.8h, v8.8h\n" + "add x19, x19, x5\n" + "smlal v17.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "tbz x4, #2, 77f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x4, #1, 76f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[6], [x19]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[4], [x19]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x4, #1, 78f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[2], [x19]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v29.b }[0], [x19]\n" + "79:" // Oddments: Load (4, 4): Bit 2: End + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v17.4s, v29.4h, v8.4h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "tbz x4, #2, 81f\n" + "ld1 { v21.4s }, [x17], #0x10\n" + "ld1 { v30.4s }, [x15], #0x10\n" + "tbz x4, #1, 80f\n" + "ld1 { v31.d }[0], [x17], #0x8\n" + "ld1 { v9.d }[0], [x15], #0x8\n" + "tbz x4, #0, 83f\n" + "ld1 { v31.s }[2], [x17]\n" + "ld1 { v9.s }[2], [x15]\n" + "b 83f\n" + "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v31.s }[0], [x17]\n" + "ld1 { v9.s }[0], [x15]\n" + "b 83f\n" + "81:" // Oddments: Load requant params: Bit 2: Unset + "tbz x4, #1, 82f\n" + "ld1 { v21.d }[0], [x17], #0x8\n" + "ld1 { v30.d }[0], [x15], #0x8\n" + "tbz x4, #0, 83f\n" + "ld1 { v21.s }[2], [x17]\n" + "ld1 { v30.s }[2], [x15]\n" + "b 83f\n" + "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v21.s }[0], [x17]\n" + "ld1 { v30.s }[0], [x15]\n" + "83:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v15.4s, v15.4s, v21.4s\n" + "add x14, x14, x7\n" + "sqrdmulh v10.4s, v10.4s, v31.4s\n" + "add x13, x13, x7\n" + "sqrdmulh v20.4s, v20.4s, v21.4s\n" + "add x12, x12, x7\n" + "sqrdmulh v23.4s, v23.4s, v31.4s\n" + "add x11, x11, x7\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "and v26.16b, v15.16b, v30.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v8.16b, v10.16b, v9.16b\n" + "and v4.16b, v20.16b, v30.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v2.16b, v23.16b, v9.16b\n" + "and v1.16b, v16.16b, v30.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v31.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v26.4s\n" + "sqrdmulh v17.4s, v17.4s, v21.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "sqadd v10.4s, v10.4s, v8.4s\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "srshl v15.4s, v15.4s, v30.4s\n" + "sqadd v23.4s, v23.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "srshl v20.4s, v20.4s, v30.4s\n" + "add v15.4s, v15.4s, v11.4s\n" + "srshl v23.4s, v23.4s, v9.4s\n" + "add v10.4s, v10.4s, v11.4s\n" + "smin v15.4s, v15.4s, v14.4s\n" + "add v20.4s, v20.4s, v11.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "smax v15.4s, v15.4s, v19.4s\n" + "smin v20.4s, v20.4s, v14.4s\n" + "smax v10.4s, v10.4s, v19.4s\n" + "add v23.4s, v23.4s, v11.4s\n" + "smax v20.4s, v20.4s, v19.4s\n" + "uzp1 v15.16b, v15.16b, v10.16b\n" + "smin v23.4s, v23.4s, v14.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "sqadd v16.4s, v16.4s, v1.4s\n" + "smax v23.4s, v23.4s, v19.4s\n" + "and v24.16b, v22.16b, v9.16b\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v23.16b\n" + "srshl v16.4s, v16.4s, v30.4s\n" + "and v2.16b, v17.16b, v30.16b\n" + "sshr 
v2.4s, v2.4s, #0x1f\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "add v16.4s, v16.4s, v11.4s\n" + "sqadd v22.4s, v22.4s, v24.4s\n" + "and v31.16b, v18.16b, v9.16b\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "smin v16.4s, v16.4s, v14.4s\n" + "srshl v22.4s, v22.4s, v9.4s\n" + "sqadd v17.4s, v17.4s, v2.4s\n" + "smax v16.4s, v16.4s, v19.4s\n" + "add v22.4s, v22.4s, v11.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqadd v18.4s, v18.4s, v31.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "add v17.4s, v17.4s, v11.4s\n" + "srshl v18.4s, v18.4s, v9.4s\n" + "smax v22.4s, v22.4s, v19.4s\n" + "smin v17.4s, v17.4s, v14.4s\n" + "uzp1 v16.16b, v16.16b, v22.16b\n" + "add v18.4s, v18.4s, v11.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "smax v17.4s, v17.4s, v19.4s\n" + "smin v18.4s, v18.4s, v14.4s\n" + "smax v18.4s, v18.4s, v19.4s\n" + "uzp1 v17.16b, v17.16b, v18.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "tbz x4, #2, 85f\n" + "st1 { v15.s }[0], [x14], #0x4\n" + "st1 { v20.s }[0], [x13], #0x4\n" + "st1 { v16.s }[0], [x12], #0x4\n" + "st1 { v17.s }[0], [x11], #0x4\n" + "tbz x4, #1, 84f\n" + "st1 { v15.h }[2], [x14], #0x2\n" + "st1 { v20.h }[2], [x13], #0x2\n" + "st1 { v16.h }[2], [x12], #0x2\n" + "st1 { v17.h }[2], [x11], #0x2\n" + "tbz x4, #0, 87f\n" + "st1 { v15.b }[6], [x14], #0x1\n" + "st1 { v20.b }[6], [x13], #0x1\n" + "st1 { v16.b }[6], [x12], #0x1\n" + "st1 { v17.b }[6], [x11], #0x1\n" + "b 87f\n" + "84:" // Oddments: Bit 2: Bit 1: Unset + "tbz x4, #0, 87f\n" + "st1 { v15.b }[4], [x14], #0x1\n" + "st1 { v20.b }[4], [x13], #0x1\n" + "st1 { v16.b }[4], [x12], #0x1\n" + "st1 { v17.b }[4], [x11], #0x1\n" + "b 87f\n" + "85:" // Oddments: Bit 2: Unset + "tbz x4, #1, 86f\n" + "st1 { v15.h }[0], [x14], #0x2\n" + "st1 { v20.h }[0], [x13], #0x2\n" + "st1 { v16.h }[0], [x12], #0x2\n" + "st1 { v17.h }[0], [x11], #0x2\n" + "tbz x4, #0, 87f\n" + "st1 { v15.b }[2], [x14], #0x1\n" + "st1 { v20.b }[2], [x13], #0x1\n" + "st1 { v16.b }[2], [x12], #0x1\n" + "st1 { v17.b }[2], [x11], #0x1\n" + "b 87f\n" + "86:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 87f\n" + "st1 { v15.b }[0], [x14], #0x1\n" + "st1 { v20.b }[0], [x13], #0x1\n" + "st1 { v16.b }[0], [x12], #0x1\n" + "st1 { v17.b }[0], [x11], #0x1\n" + "87:" // Oddments: Bit 2: End + + "88:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24",
"x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..73de9650c3 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_5x5_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_5x5_mla::get_packed_size; + + kern_type kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + + a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // 
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..699cc6c80c --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,2213 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const uint8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[36]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[0]; + inptrs[1] = inptrs_raw[1]; + inptrs[2] = inptrs_raw[6]; + inptrs[3] = inptrs_raw[7]; + inptrs[4] = inptrs_raw[2]; + inptrs[5] = inptrs_raw[8]; + inptrs[6] = inptrs_raw[3]; + inptrs[7] = inptrs_raw[4]; + inptrs[8] = inptrs_raw[11]; + inptrs[9] = inptrs_raw[12]; + inptrs[10] = inptrs_raw[9]; + inptrs[11] = inptrs_raw[10]; + inptrs[12] = inptrs_raw[5]; + inptrs[13] = inptrs_raw[13]; + inptrs[14] = inptrs_raw[14]; + inptrs[15] = inptrs_raw[15]; + inptrs[16] = inptrs_raw[16]; + inptrs[17] = inptrs_raw[17]; + inptrs[18] = inptrs_raw[18]; + inptrs[19] = inptrs_raw[19]; + inptrs[20] = inptrs_raw[20]; + inptrs[21] = inptrs_raw[21]; + inptrs[22] = inptrs_raw[22]; + inptrs[23] = inptrs_raw[23]; +
inptrs[24] = inptrs_raw[24]; + inptrs[25] = inptrs_raw[25]; + inptrs[26] = inptrs_raw[26]; + inptrs[27] = inptrs_raw[27]; + inptrs[28] = inptrs_raw[28]; + inptrs[29] = inptrs_raw[29]; + inptrs[30] = inptrs_raw[30]; + inptrs[31] = inptrs_raw[31]; + inptrs[32] = inptrs_raw[32]; + inptrs[33] = inptrs_raw[33]; + inptrs[34] = inptrs_raw[34]; + inptrs[35] = inptrs_raw[35]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x10, #0x0\n" + "ldr x3, [%x[params], %[offsetof_Params_weights]]\n" + "mov x1, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x25, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x19, x4, #0x3\n" + "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x13, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v7.16b }, [x13]\n" + "add x8, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v13.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v19.4s }, [x8]\n" + "add x8, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v16.4s }, [x20]\n" + "ld1r { v12.4s }, [x8]\n" + "ldp x17, x16, [x21, #0x0]\n" + "ldp x6, x8, [x21, #0x10]\n" + "cbz x19, 3f\n" + "subs x19, x19, #0x1\n" + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x12, #0x0]\n" + "mov v18.16b, v15.16b\n" + "ldr q20, [x12, #0x10]\n" + "add x12, x12, #0x20\n" + "mov v11.16b, v15.16b\n" + "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "mov v10.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "mov v5.16b, v20.16b\n" + "ldr d1, [x3, #0x8]\n" + "mov v8.16b, v20.16b\n" + "ldr d2, [x3, #0x10]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "mov v9.16b, v20.16b\n" + "ldr d3, [x3, #0x18]\n" + "ldr d4, [x3, #0x20]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "ldp x28, x27, [x25, #0x0]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "ldr d31, [x28, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "ldr d30, [x27, x10]\n" + "ldr d29, [x26, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "ldr d28, [x13, x10]\n" + "ldr d27, [x24, x10]\n" + "usubl v29.8h, v29.8b, v7.8b\n" + "ldr d23, [x23, x10]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "ldr d25, [x22, x10]\n" + "ldr d24, [x21, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "ldr d26, [x20, x10]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "ldr d22, [x0, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "usubl v22.8h, v22.8b, v7.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v15.4s, v31.4h, v0.4h\n" + "ldr x20, [x25, #0x50]\n" + "subs x19, x19, #0x1\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr x28, [x25, #0x58]\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "ldr x0, [x25, #0x60]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ldr d31, [x20, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "ldr x7, [x25, #0x68]\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "ldr x26, [x25, #0x70]\n" + "smlal v10.4s, v28.4h, v0.4h\n" + "ldr x23, [x25, #0x78]\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ldr d0, [x3, #0x28]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "ldr x20, [x25, 
#0x80]\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v1.4h\n" + "ldr x22, [x25, #0x88]\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "ldr x13, [x25, #0x90]\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "ldr x21, [x25, #0x98]\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "ldr x14, [x25, #0xa0]\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "smlal2 v9.4s, v23.8h, v1.8h\n" + "ldr d1, [x3, #0x30]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "ldr d27, [x0, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "ldr x0, [x25, #0xb8]\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr x15, [x25, #0xc0]\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "ldr x9, [x25, #0xc8]\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "ldr x27, [x25, #0xd0]\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr x28, [x25, #0xd8]\n" + "smlal2 v9.4s, v31.8h, v2.8h\n" + "ldr d2, [x3, #0x38]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr q6, [x2, #0x0]\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x7, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "ldr x12, [x25, #0xe0]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ldr q21, [x5, #0x0]\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "ldr q17, [x2, #0x10]\n" + "add x2, x2, #0x20\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "ldr q14, [x5, #0x10]\n" + "add x5, x5, #0x20\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "smlal2 v9.4s, v30.8h, v3.8h\n" + "ldr d3, [x3, #0x40]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x26, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "ldr x7, [x25, #0xe8]\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v11.4s, v30.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0x48]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v20.4s, v29.8h, v0.8h\n" + "smlal v18.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal2 v9.4s, v25.8h, v0.8h\n" + "ldr d0, [x3, #0x50]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x10]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v1.4h\n" + "ldr x23, [x25, #0xf8]\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal2 v9.4s, v24.8h, v1.8h\n" + "ldr d1, [x3, #0x58]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v20.4s, v23.8h, v2.8h\n" + "ldr d23, [x20, x10]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v31.4h, v2.4h\n" + "ldr x22, [x25, #0x100]\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal2 v9.4s, v27.8h, v2.8h\n" + "ldr d2, [x3, #0x60]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v20.4s, v31.8h, v3.8h\n" + "ldr d31, [x13, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v3.4h\n" + "ldr x20, [x25, #0x108]\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "smlal 
v11.4s, v27.4h, v3.4h\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "smlal v10.4s, v23.4h, v3.4h\n" + "smlal2 v9.4s, v23.8h, v3.8h\n" + "ldr d3, [x3, #0x68]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "ldr x13, [x25, #0x110]\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d26, [x14, x10]\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "ldr x21, [x25, #0x118]\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr d4, [x3, #0x70]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v20.4s, v22.8h, v0.8h\n" + "ldr d22, [x0, x10]\n" + "usubl v22.8h, v22.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "smlal v10.4s, v30.4h, v0.4h\n" + "smlal2 v9.4s, v30.8h, v0.8h\n" + "ldr d0, [x3, #0x78]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v20.4s, v25.8h, v1.8h\n" + "ldr d25, [x11, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "smlal v10.4s, v26.4h, v1.4h\n" + "smlal2 v9.4s, v26.8h, v1.8h\n" + "ldr d1, [x3, #0x80]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v20.4s, v24.8h, v2.8h\n" + "ldr d24, [x24, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "ldr d2, [x3, #0x88]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "ldr d27, [x15, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "ldr d3, [x3, #0x90]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v20.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x10]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d28, [x12, x10]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v22.4h, v4.4h\n" + "smlal2 v9.4s, v22.8h, v4.8h\n" + "ldr d4, [x3, #0x98]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x27, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "smlal v10.4s, v23.4h, v0.4h\n" + "smlal2 v9.4s, v23.8h, v0.8h\n" + "ldr d0, [x3, #0xa0]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v10.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "ldr d1, [x3, #0xa8]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v20.4s, 
v26.8h, v2.8h\n" + "ldr d26, [x7, x10]\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v10.4s, v30.4h, v2.4h\n" + "smlal2 v9.4s, v30.8h, v2.8h\n" + "ldr d2, [x3, #0xb0]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x26, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v10.4s, v28.4h, v3.4h\n" + "smlal2 v9.4s, v28.8h, v3.8h\n" + "ldr d3, [x3, #0xb8]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0xc0]\n" + "add x3, x3, #0xc8\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v20.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x10]\n" + "smlal v18.4s, v23.4h, v0.4h\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ldr d25, [x20, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v0.4h\n" + "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v20.4s, v23.8h, v1.8h\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "ldr d24, [x13, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v1.4h\n" + "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v20.4s, v31.8h, v2.8h\n" + "smlal v18.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x10]\n" + "add x10, x10, #0x8\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v20.4s, v30.8h, v3.8h\n" + "smlal v18.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v20.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v27.4h, v4.4h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "sqrdmulh v20.4s, v20.4s, v17.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v5.4s, v5.4s, v17.4s\n" + "and v1.16b, v15.16b, v21.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v29.16b, v20.16b, v14.16b\n" + "and v3.16b, v18.16b, v21.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v2.16b, v5.16b, v14.16b\n" + "sqrdmulh v11.4s, v11.4s, v6.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v1.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "and v0.16b, v11.16b, v21.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "sqadd v5.4s, v5.4s, v2.4s\n" + "and v27.16b, 
v8.16b, v14.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "add v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v21.4s\n" + "srshl v5.4s, v5.4s, v14.4s\n" + "smin v15.4s, v15.4s, v12.4s\n" + "add v20.4s, v20.4s, v19.4s\n" + "add v18.4s, v18.4s, v19.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smin v18.4s, v18.4s, v12.4s\n" + "add v5.4s, v5.4s, v19.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smin v5.4s, v5.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "sqadd v11.4s, v11.4s, v0.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x17, x1]\n" + "smax v5.4s, v5.4s, v16.4s\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "srshl v11.4s, v11.4s, v21.4s\n" + "and v30.16b, v10.16b, v21.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "uzp1 v18.16b, v18.16b, v5.16b\n" + "add v11.4s, v11.4s, v19.4s\n" + "srshl v8.4s, v8.4s, v14.4s\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d18, [x16, x1]\n" + "smin v11.4s, v11.4s, v12.4s\n" + "sqrdmulh v9.4s, v9.4s, v17.4s\n" + "add v8.4s, v8.4s, v19.4s\n" + "sqadd v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "smin v8.4s, v8.4s, v12.4s\n" + "and v6.16b, v9.16b, v14.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "smax v8.4s, v8.4s, v16.4s\n" + "srshl v10.4s, v10.4s, v21.4s\n" + "uzp1 v11.16b, v11.16b, v8.16b\n" + "add v10.4s, v10.4s, v19.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x6, x1]\n" + "smin v10.4s, v10.4s, v12.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "srshl v9.4s, v9.4s, v14.4s\n" + "add v9.4s, v9.4s, v19.4s\n" + "smin v9.4s, v9.4s, v12.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v10.16b, v10.16b, v10.16b\n" + "str d10, [x8, x1]\n" + "add x1, x1, #0x8\n" + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x12, #0x0]\n" + "mov v18.16b, v15.16b\n" + "ldr q20, [x12, #0x10]\n" + "add x12, x12, #0x20\n" + "mov v11.16b, v15.16b\n" + "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "mov v10.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "mov v5.16b, v20.16b\n" + "ldr d1, [x3, #0x8]\n" + "mov v8.16b, v20.16b\n" + "ldr d2, [x3, #0x10]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "mov v9.16b, v20.16b\n" + "ldr d3, [x3, #0x18]\n" + "ldr d4, [x3, #0x20]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "ldp x28, x27, [x25, #0x0]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "ldr d31, [x28, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "ldr d30, [x27, x10]\n" + "ldr d29, [x26, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "ldr d28, [x13, x10]\n" + "ldr d27, [x24, x10]\n" + "usubl v29.8h, v29.8b, v7.8b\n" + "ldr d23, [x23, x10]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "ldr d25, [x22, x10]\n" + "ldr d24, [x21, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "ldr d26, [x20, x10]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "ldr d22, [x0, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "usubl v22.8h, v22.8b, v7.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v15.4s, v31.4h, v0.4h\n" + "ldr x20, [x25, #0x50]\n" + "tst x4, #0x7\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr x28, [x25, #0x58]\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "ldr x0, [x25, #0x60]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ldr d31, [x20, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v11.4s, v29.4h, 
v0.4h\n" + "ldr x7, [x25, #0x68]\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "ldr x26, [x25, #0x70]\n" + "smlal v10.4s, v28.4h, v0.4h\n" + "ldr x23, [x25, #0x78]\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ldr d0, [x3, #0x28]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "ldr x20, [x25, #0x80]\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v1.4h\n" + "ldr x22, [x25, #0x88]\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "ldr x13, [x25, #0x90]\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "ldr x21, [x25, #0x98]\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "ldr x14, [x25, #0xa0]\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "smlal2 v9.4s, v23.8h, v1.8h\n" + "ldr d1, [x3, #0x30]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "ldr d27, [x0, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "ldr x0, [x25, #0xb8]\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr x15, [x25, #0xc0]\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "ldr x9, [x25, #0xc8]\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "ldr x27, [x25, #0xd0]\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr x28, [x25, #0xd8]\n" + "smlal2 v9.4s, v31.8h, v2.8h\n" + "ldr d2, [x3, #0x38]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr x12, [x25, #0xe0]\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x7, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "ldr x7, [x25, #0xe8]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ldr q6, [x2, #0x0]\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "ldr q21, [x5, #0x0]\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "ldr q17, [x2, #0x10]\n" + "add x2, x2, #0x20\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr q14, [x5, #0x10]\n" + "add x5, x5, #0x20\n" + "smlal2 v9.4s, v30.8h, v3.8h\n" + "ldr d3, [x3, #0x40]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x26, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v11.4s, v30.4h, v4.4h\n" + "ldr x23, [x25, #0xf8]\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0x48]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v20.4s, v29.8h, v0.8h\n" + "smlal v18.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal2 v9.4s, v25.8h, v0.8h\n" + "ldr d0, [x3, #0x50]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x10]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v1.4h\n" + "ldr x22, [x25, #0x100]\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal2 v9.4s, v24.8h, v1.8h\n" + "ldr d1, [x3, #0x58]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v20.4s, v23.8h, v2.8h\n" + "ldr d23, [x20, x10]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v31.4h, v2.4h\n" + "ldr x20, [x25, #0x108]\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "smlal v10.4s, v27.4h, 
v2.4h\n" + "smlal2 v9.4s, v27.8h, v2.8h\n" + "ldr d2, [x3, #0x60]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v20.4s, v31.8h, v3.8h\n" + "ldr d31, [x13, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v3.4h\n" + "ldr x13, [x25, #0x110]\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v27.4h, v3.4h\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "smlal v10.4s, v23.4h, v3.4h\n" + "smlal2 v9.4s, v23.8h, v3.8h\n" + "ldr d3, [x3, #0x68]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "ldr x21, [x25, #0x118]\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d26, [x14, x10]\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr d4, [x3, #0x70]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v20.4s, v22.8h, v0.8h\n" + "ldr d22, [x0, x10]\n" + "usubl v22.8h, v22.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "smlal v10.4s, v30.4h, v0.4h\n" + "smlal2 v9.4s, v30.8h, v0.8h\n" + "ldr d0, [x3, #0x78]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v20.4s, v25.8h, v1.8h\n" + "ldr d25, [x11, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "smlal v10.4s, v26.4h, v1.4h\n" + "smlal2 v9.4s, v26.8h, v1.8h\n" + "ldr d1, [x3, #0x80]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v20.4s, v24.8h, v2.8h\n" + "ldr d24, [x24, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "ldr d2, [x3, #0x88]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "ldr d27, [x15, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "ldr d3, [x3, #0x90]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v20.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x10]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "smlal v18.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d28, [x12, x10]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v22.4h, v4.4h\n" + "smlal2 v9.4s, v22.8h, v4.8h\n" + "ldr d4, [x3, #0x98]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "ldr d31, [x27, x10]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "smlal v10.4s, v23.4h, v0.4h\n" + "smlal2 v9.4s, v23.8h, v0.8h\n" + "ldr d0, [x3, #0xa0]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal 
v18.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v10.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "ldr d1, [x3, #0xa8]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v20.4s, v26.8h, v2.8h\n" + "ldr d26, [x7, x10]\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v10.4s, v30.4h, v2.4h\n" + "smlal2 v9.4s, v30.8h, v2.8h\n" + "ldr d2, [x3, #0xb0]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "ldr d25, [x26, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v10.4s, v28.4h, v3.4h\n" + "smlal2 v9.4s, v28.8h, v3.8h\n" + "ldr d3, [x3, #0xb8]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v18.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0xc0]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "smlal2 v20.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x10]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v18.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ldr d25, [x20, x10]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v0.4h\n" + "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v20.4s, v23.8h, v1.8h\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "ldr d24, [x13, x10]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v1.4h\n" + "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v20.4s, v31.8h, v2.8h\n" + "smlal v18.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x10]\n" + "add x10, x10, #0x8\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v20.4s, v30.8h, v3.8h\n" + "smlal v18.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v20.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v10.4s, v27.4h, v4.4h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "sqrdmulh v20.4s, v20.4s, v17.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v5.4s, v5.4s, v17.4s\n" + "and v1.16b, v15.16b, v21.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v29.16b, v20.16b, v14.16b\n" + "and v3.16b, v18.16b, v21.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v2.16b, v5.16b, v14.16b\n" + "sqrdmulh v11.4s, v11.4s, v6.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd 
v15.4s, v15.4s, v1.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "and v0.16b, v11.16b, v21.16b\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v21.4s\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "sqadd v5.4s, v5.4s, v2.4s\n" + "and v27.16b, v8.16b, v14.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "add v15.4s, v15.4s, v19.4s\n" + "srshl v20.4s, v20.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v21.4s\n" + "srshl v5.4s, v5.4s, v14.4s\n" + "smin v15.4s, v15.4s, v12.4s\n" + "add v20.4s, v20.4s, v19.4s\n" + "add v18.4s, v18.4s, v19.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smin v18.4s, v18.4s, v12.4s\n" + "add v5.4s, v5.4s, v19.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smin v5.4s, v5.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "sqadd v11.4s, v11.4s, v0.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x17, x1]\n" + "smax v5.4s, v5.4s, v16.4s\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "srshl v11.4s, v11.4s, v21.4s\n" + "and v30.16b, v10.16b, v21.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "uzp1 v18.16b, v18.16b, v5.16b\n" + "add v11.4s, v11.4s, v19.4s\n" + "srshl v8.4s, v8.4s, v14.4s\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d18, [x16, x1]\n" + "smin v11.4s, v11.4s, v12.4s\n" + "sqrdmulh v9.4s, v9.4s, v17.4s\n" + "add v8.4s, v8.4s, v19.4s\n" + "sqadd v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "smin v8.4s, v8.4s, v12.4s\n" + "and v6.16b, v9.16b, v14.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "smax v8.4s, v8.4s, v16.4s\n" + "srshl v10.4s, v10.4s, v21.4s\n" + "uzp1 v11.16b, v11.16b, v8.16b\n" + "add v10.4s, v10.4s, v19.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x6, x1]\n" + "smin v10.4s, v10.4s, v12.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "srshl v9.4s, v9.4s, v14.4s\n" + "add v9.4s, v9.4s, v19.4s\n" + "smin v9.4s, v9.4s, v12.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v10.16b, v10.16b, v10.16b\n" + "str d10, [x8, x1]\n" + "add x1, x1, #0x8\n" + "beq 124f\n" + "add x3, x3, #0xc8\n" + "3:" // Oddments + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x4, #2, 5f\n" + "ld1 { v15.4s }, [x12], #0x10\n" + "tbz x4, #1, 4f\n" + "ld1 { v20.d }[0], [x12], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v20.s }[2], [x12]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v20.s }[0], [x12]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x4, #1, 6f\n" + "ld1 { v15.d }[0], [x12], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[2], [x12]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[0], [x12]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v18.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "mov v5.16b, v20.16b\n" + "ldr d1, [x3, #0x8]\n" + "mov v11.16b, v15.16b\n" + "ldr d2, [x3, #0x10]\n" + "mov v8.16b, v20.16b\n" + "ldr d3, [x3, #0x18]\n" + "mov v10.16b, v15.16b\n" + "ldr d4, [x3, #0x20]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "mov v9.16b, v20.16b\n" + "ldp x28, x27, [x25, #0x0]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "add x28, x28, x10\n" + "add x27, x27, x10\n" + "add x26, x26, x10\n" + "add x13, x13, x10\n" + "add x24, x24, x10\n" + "add x23, x23, x10\n" + 
"add x22, x22, x10\n" + "add x21, x21, x10\n" + "add x20, x20, x10\n" + "add x0, x0, x10\n" + "tbz x4, #2, 9f\n" + "ld1 { v31.s }[0], [x28], #0x4\n" + "ld1 { v30.s }[0], [x27], #0x4\n" + "ld1 { v29.s }[0], [x26], #0x4\n" + "ld1 { v28.s }[0], [x13], #0x4\n" + "ld1 { v27.s }[0], [x24], #0x4\n" + "ld1 { v23.s }[0], [x23], #0x4\n" + "ld1 { v25.s }[0], [x22], #0x4\n" + "ld1 { v24.s }[0], [x21], #0x4\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "ld1 { v22.s }[0], [x0], #0x4\n" + "tbz x4, #1, 8f\n" + "ld1 { v31.h }[2], [x28], #0x2\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x13], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v24.h }[2], [x21], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v22.h }[2], [x0], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[6], [x28]\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x13]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v24.b }[6], [x21]\n" + "ld1 { v26.b }[6], [x20]\n" + "ld1 { v22.b }[6], [x0]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[4], [x28]\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x13]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v24.b }[4], [x21]\n" + "ld1 { v26.b }[4], [x20]\n" + "ld1 { v22.b }[4], [x0]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x4, #1, 10f\n" + "ld1 { v31.h }[0], [x28], #0x2\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "ld1 { v29.h }[0], [x26], #0x2\n" + "ld1 { v28.h }[0], [x13], #0x2\n" + "ld1 { v27.h }[0], [x24], #0x2\n" + "ld1 { v23.h }[0], [x23], #0x2\n" + "ld1 { v25.h }[0], [x22], #0x2\n" + "ld1 { v24.h }[0], [x21], #0x2\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "ld1 { v22.h }[0], [x0], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[2], [x28]\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x13]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v24.b }[2], [x21]\n" + "ld1 { v26.b }[2], [x20]\n" + "ld1 { v22.b }[2], [x0]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[0], [x28]\n" + "ld1 { v30.b }[0], [x27]\n" + "ld1 { v29.b }[0], [x26]\n" + "ld1 { v28.b }[0], [x13]\n" + "ld1 { v27.b }[0], [x24]\n" + "ld1 { v23.b }[0], [x23]\n" + "ld1 { v25.b }[0], [x22]\n" + "ld1 { v24.b }[0], [x21]\n" + "ld1 { v26.b }[0], [x20]\n" + "ld1 { v22.b }[0], [x0]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ldr x20, [x25, #0x50]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "usubl v29.8h, v29.8b, v7.8b\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v28.4h, v0.4h\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "usubl v22.8h, v22.8b, v7.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "add x20, x20, x10\n" + "smlal v18.4s, v27.4h, v1.4h\n" + "smlal2 v5.4s, 
v27.8h, v1.8h\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "smlal2 v9.4s, v23.8h, v1.8h\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "tbz x4, #2, 13f\n" + "ld1 { v31.s }[0], [x20], #0x4\n" + "tbz x4, #1, 12f\n" + "ld1 { v31.h }[2], [x20], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[6], [x20]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[4], [x20]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x4, #1, 14f\n" + "ld1 { v31.h }[0], [x20], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[2], [x20]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[0], [x20]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr x28, [x25, #0x58]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "add x28, x28, x10\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "smlal2 v9.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "tbz x4, #2, 17f\n" + "ld1 { v30.s }[0], [x28], #0x4\n" + "tbz x4, #1, 16f\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[6], [x28]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[4], [x28]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x4, #1, 18f\n" + "ld1 { v30.h }[0], [x28], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[2], [x28]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[0], [x28]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "smlal v15.4s, v24.4h, v4.4h\n" + "ldr x0, [x25, #0x60]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "add x0, x0, x10\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "smlal2 v9.4s, v30.8h, v3.8h\n" + "tbz x4, #2, 21f\n" + "ld1 { v27.s }[0], [x0], #0x4\n" + "tbz x4, #1, 20f\n" + "ld1 { v27.h }[2], [x0], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[6], [x0]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[4], [x0]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 5): Bit 2: Unset + "tbz x4, #1, 22f\n" + "ld1 { v27.h }[0], [x0], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[2], [x0]\n" + "b 23f\n" + "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[0], [x0]\n" + "23:" // Oddments: Load (0, 5): Bit 2: End + "smlal v11.4s, v30.4h, v4.4h\n" + "ldr d0, [x3, #0x28]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "ldr x7, [x25, #0x68]\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "add x7, x7, x10\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v20.4s, v29.8h, v0.8h\n" + "smlal v18.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "tbz x4, #2, 25f\n" + "ld1 { v25.s }[0], [x7], #0x4\n" + "tbz x4, #1, 24f\n" + "ld1 { v25.h }[2], [x7], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[6], [x7]\n" + "b 27f\n" + "24:" // Oddments: 
Load (2, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[4], [x7]\n" + "b 27f\n" + "25:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x4, #1, 26f\n" + "ld1 { v25.h }[0], [x7], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[2], [x7]\n" + "b 27f\n" + "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[0], [x7]\n" + "27:" // Oddments: Load (2, 1): Bit 2: End + "ldr d1, [x3, #0x30]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "ldr x26, [x25, #0x70]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v25.8h, v0.8h\n" + "add x26, x26, x10\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "smlal v18.4s, v23.4h, v1.4h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "tbz x4, #2, 29f\n" + "ld1 { v24.s }[0], [x26], #0x4\n" + "tbz x4, #1, 28f\n" + "ld1 { v24.h }[2], [x26], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[6], [x26]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[4], [x26]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x4, #1, 30f\n" + "ld1 { v24.h }[0], [x26], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[2], [x26]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[0], [x26]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "ldr d2, [x3, #0x38]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "ldr x23, [x25, #0x78]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v1.8h\n" + "add x23, x23, x10\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v20.4s, v23.8h, v2.8h\n" + "smlal v18.4s, v31.4h, v2.4h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "tbz x4, #2, 33f\n" + "ld1 { v27.s }[0], [x23], #0x4\n" + "tbz x4, #1, 32f\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[6], [x23]\n" + "b 35f\n" + "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[4], [x23]\n" + "b 35f\n" + "33:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x4, #1, 34f\n" + "ld1 { v27.h }[0], [x23], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[2], [x23]\n" + "b 35f\n" + "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[0], [x23]\n" + "35:" // Oddments: Load (2, 3): Bit 2: End + "ldr d3, [x3, #0x40]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "ldr x20, [x25, #0x80]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v27.8h, v2.8h\n" + "add x20, x20, x10\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v20.4s, v31.8h, v3.8h\n" + "smlal v18.4s, v30.4h, v3.4h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v27.4h, v3.4h\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "tbz x4, #2, 37f\n" + "ld1 { v23.s }[0], [x20], #0x4\n" + "tbz x4, #1, 36f\n" + "ld1 { v23.h }[2], [x20], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[6], [x20]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[4], [x20]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x4, #1, 38f\n" + "ld1 { v23.h }[0], [x20], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[2], [x20]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[0], [x20]\n" + "39:" // Oddments: Load (2, 4): Bit 2: End + 
"ldr d4, [x3, #0x48]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "smlal v10.4s, v23.4h, v3.4h\n" + "ldr x22, [x25, #0x88]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v9.4s, v23.8h, v3.8h\n" + "add x22, x22, x10\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v20.4s, v30.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "tbz x4, #2, 41f\n" + "ld1 { v28.s }[0], [x22], #0x4\n" + "tbz x4, #1, 40f\n" + "ld1 { v28.h }[2], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 5): Bit 2: Unset + "tbz x4, #1, 42f\n" + "ld1 { v28.h }[0], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 5): Bit 2: End + "ldr d0, [x3, #0x50]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr x13, [x25, #0x90]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "add x13, x13, x10\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v20.4s, v22.8h, v0.8h\n" + "smlal v18.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "tbz x4, #2, 45f\n" + "ld1 { v31.s }[0], [x13], #0x4\n" + "tbz x4, #1, 44f\n" + "ld1 { v31.h }[2], [x13], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[6], [x13]\n" + "b 47f\n" + "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[4], [x13]\n" + "b 47f\n" + "45:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x4, #1, 46f\n" + "ld1 { v31.h }[0], [x13], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[2], [x13]\n" + "b 47f\n" + "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[0], [x13]\n" + "47:" // Oddments: Load (3, 0): Bit 2: End + "ldr x21, [x25, #0x98]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "add x21, x21, x10\n" + "tbz x4, #2, 49f\n" + "ld1 { v30.s }[0], [x21], #0x4\n" + "tbz x4, #1, 48f\n" + "ld1 { v30.h }[2], [x21], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[6], [x21]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[4], [x21]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x4, #1, 50f\n" + "ld1 { v30.h }[0], [x21], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[2], [x21]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[0], [x21]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "ldr d1, [x3, #0x58]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal v10.4s, v30.4h, v0.4h\n" + "ldr x14, [x25, #0xa0]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v30.8h, v0.8h\n" + "add x14, x14, x10\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v20.4s, v25.8h, v1.8h\n" + "smlal v18.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "tbz x4, #2, 53f\n" + "ld1 { v26.s }[0], [x14], #0x4\n" + "tbz x4, #1, 52f\n" + "ld1 { v26.h }[2], [x14], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[6], [x14]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[4], [x14]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: 
Unset + "tbz x4, #1, 54f\n" + "ld1 { v26.h }[0], [x14], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[2], [x14]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[0], [x14]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "ldr d2, [x3, #0x60]\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "smlal v10.4s, v26.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v26.8h, v1.8h\n" + "add x11, x11, x10\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v20.4s, v24.8h, v2.8h\n" + "smlal v18.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "tbz x4, #2, 57f\n" + "ld1 { v25.s }[0], [x11], #0x4\n" + "tbz x4, #1, 56f\n" + "ld1 { v25.h }[2], [x11], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[6], [x11]\n" + "b 59f\n" + "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[4], [x11]\n" + "b 59f\n" + "57:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x4, #1, 58f\n" + "ld1 { v25.h }[0], [x11], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[2], [x11]\n" + "b 59f\n" + "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[0], [x11]\n" + "59:" // Oddments: Load (3, 3): Bit 2: End + "ldr d3, [x3, #0x68]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "add x24, x24, x10\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal v18.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 61f\n" + "ld1 { v24.s }[0], [x24], #0x4\n" + "tbz x4, #1, 60f\n" + "ld1 { v24.h }[2], [x24], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[6], [x24]\n" + "b 63f\n" + "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[4], [x24]\n" + "b 63f\n" + "61:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x4, #1, 62f\n" + "ld1 { v24.h }[0], [x24], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[2], [x24]\n" + "b 63f\n" + "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[0], [x24]\n" + "63:" // Oddments: Load (3, 4): Bit 2: End + "ldr d4, [x3, #0x70]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "ldr x0, [x25, #0xb8]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "add x0, x0, x10\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v20.4s, v23.8h, v4.8h\n" + "smlal v18.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 65f\n" + "ld1 { v22.s }[0], [x0], #0x4\n" + "tbz x4, #1, 64f\n" + "ld1 { v22.h }[2], [x0], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[6], [x0]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[4], [x0]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 5): Bit 2: Unset + "tbz x4, #1, 66f\n" + "ld1 { v22.h }[0], [x0], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[2], [x0]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[0], [x0]\n" + "67:" // Oddments: Load (3, 5): Bit 2: End + "ldr d0, [x3, #0x78]\n" + "usubl v22.8h, v22.8b, v7.8b\n" + "smlal v10.4s, v22.4h, v4.4h\n" + "ldr x15, [x25, #0xc0]\n" + "usubl v0.8h, v0.8b, 
v13.8b\n" + "smlal2 v9.4s, v22.8h, v4.8h\n" + "add x15, x15, x10\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v20.4s, v31.8h, v0.8h\n" + "smlal v18.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "tbz x4, #2, 69f\n" + "ld1 { v27.s }[0], [x15], #0x4\n" + "tbz x4, #1, 68f\n" + "ld1 { v27.h }[2], [x15], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[6], [x15]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[4], [x15]\n" + "b 71f\n" + "69:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x4, #1, 70f\n" + "ld1 { v27.h }[0], [x15], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[2], [x15]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[0], [x15]\n" + "71:" // Oddments: Load (4, 0): Bit 2: End + "ldr x9, [x25, #0xc8]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "add x9, x9, x10\n" + "tbz x4, #2, 73f\n" + "ld1 { v23.s }[0], [x9], #0x4\n" + "tbz x4, #1, 72f\n" + "ld1 { v23.h }[2], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[6], [x9]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[4], [x9]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x4, #1, 74f\n" + "ld1 { v23.h }[0], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[2], [x9]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[0], [x9]\n" + "75:" // Oddments: Load (4, 1): Bit 2: End + "ldr d1, [x3, #0x80]\n" + "usubl v23.8h, v23.8b, v7.8b\n" + "smlal v10.4s, v23.4h, v0.4h\n" + "ldr x27, [x25, #0xd0]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v23.8h, v0.8h\n" + "add x27, x27, x10\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v20.4s, v30.8h, v1.8h\n" + "smlal v18.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "tbz x4, #2, 77f\n" + "ld1 { v31.s }[0], [x27], #0x4\n" + "tbz x4, #1, 76f\n" + "ld1 { v31.h }[2], [x27], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[6], [x27]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[4], [x27]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x4, #1, 78f\n" + "ld1 { v31.h }[0], [x27], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[2], [x27]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[0], [x27]\n" + "79:" // Oddments: Load (4, 2): Bit 2: End + "ldr d2, [x3, #0x88]\n" + "usubl v31.8h, v31.8b, v7.8b\n" + "smlal v10.4s, v31.4h, v1.4h\n" + "ldr x28, [x25, #0xd8]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "add x28, x28, x10\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v20.4s, v26.8h, v2.8h\n" + "smlal v18.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "tbz x4, #2, 81f\n" + "ld1 { v30.s }[0], [x28], #0x4\n" + "tbz x4, #1, 80f\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[6], [x28]\n" + "b 83f\n" + "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[4], [x28]\n" + "b 83f\n" + "81:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x4, #1, 82f\n" + "ld1 { v30.h }[0], [x28], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[2], [x28]\n" + "b 83f\n" + "82:" // Oddments: Load 
(4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[0], [x28]\n" + "83:" // Oddments: Load (4, 3): Bit 2: End + "ldr d3, [x3, #0x90]\n" + "usubl v30.8h, v30.8b, v7.8b\n" + "smlal v10.4s, v30.4h, v2.4h\n" + "ldr x12, [x25, #0xe0]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v30.8h, v2.8h\n" + "add x12, x12, x10\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v20.4s, v25.8h, v3.8h\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "tbz x4, #2, 85f\n" + "ld1 { v28.s }[0], [x12], #0x4\n" + "tbz x4, #1, 84f\n" + "ld1 { v28.h }[2], [x12], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[6], [x12]\n" + "b 87f\n" + "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[4], [x12]\n" + "b 87f\n" + "85:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x4, #1, 86f\n" + "ld1 { v28.h }[0], [x12], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[2], [x12]\n" + "b 87f\n" + "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[0], [x12]\n" + "87:" // Oddments: Load (4, 4): Bit 2: End + "ldr d4, [x3, #0x98]\n" + "usubl v28.8h, v28.8b, v7.8b\n" + "smlal v10.4s, v28.4h, v3.4h\n" + "ldr x7, [x25, #0xe8]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v9.4s, v28.8h, v3.8h\n" + "add x7, x7, x10\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v20.4s, v24.8h, v4.8h\n" + "smlal v18.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "tbz x4, #2, 89f\n" + "ld1 { v26.s }[0], [x7], #0x4\n" + "tbz x4, #1, 88f\n" + "ld1 { v26.h }[2], [x7], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[6], [x7]\n" + "b 91f\n" + "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[4], [x7]\n" + "b 91f\n" + "89:" // Oddments: Load (4, 5): Bit 2: Unset + "tbz x4, #1, 90f\n" + "ld1 { v26.h }[0], [x7], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[2], [x7]\n" + "b 91f\n" + "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[0], [x7]\n" + "91:" // Oddments: Load (4, 5): Bit 2: End + "ldr d0, [x3, #0xa0]\n" + "usubl v26.8h, v26.8b, v7.8b\n" + "smlal v10.4s, v26.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "usubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v9.4s, v26.8h, v4.8h\n" + "add x26, x26, x10\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "smlal2 v20.4s, v27.8h, v0.8h\n" + "smlal v18.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "tbz x4, #2, 93f\n" + "ld1 { v25.s }[0], [x26], #0x4\n" + "tbz x4, #1, 92f\n" + "ld1 { v25.h }[2], [x26], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[6], [x26]\n" + "b 95f\n" + "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[4], [x26]\n" + "b 95f\n" + "93:" // Oddments: Load (5, 0): Bit 2: Unset + "tbz x4, #1, 94f\n" + "ld1 { v25.h }[0], [x26], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[2], [x26]\n" + "b 95f\n" + "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[0], [x26]\n" + "95:" // Oddments: Load (5, 0): Bit 2: End + "ldr x23, [x25, #0xf8]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "add x23, x23, x10\n" + "tbz x4, #2, 97f\n" + "ld1 { v24.s }[0], [x23], #0x4\n" + "tbz x4, #1, 96f\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[6], [x23]\n" + "b 99f\n" + "96:" // Oddments: Load (5, 1): Bit 
2: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[4], [x23]\n" + "b 99f\n" + "97:" // Oddments: Load (5, 1): Bit 2: Unset + "tbz x4, #1, 98f\n" + "ld1 { v24.h }[0], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[2], [x23]\n" + "b 99f\n" + "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[0], [x23]\n" + "99:" // Oddments: Load (5, 1): Bit 2: End + "ldr d1, [x3, #0xa8]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v0.4h\n" + "ldr x22, [x25, #0x100]\n" + "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v0.8h\n" + "add x22, x22, x10\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v20.4s, v23.8h, v1.8h\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "tbz x4, #2, 101f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x4, #1, 100f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[6], [x22]\n" + "b 103f\n" + "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[4], [x22]\n" + "b 103f\n" + "101:" // Oddments: Load (5, 2): Bit 2: Unset + "tbz x4, #1, 102f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[2], [x22]\n" + "b 103f\n" + "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[0], [x22]\n" + "103:" // Oddments: Load (5, 2): Bit 2: End + "ldr d2, [x3, #0xb0]\n" + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v1.4h\n" + "ldr x20, [x25, #0x108]\n" + "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v9.4s, v27.8h, v1.8h\n" + "add x20, x20, x10\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v20.4s, v31.8h, v2.8h\n" + "smlal v18.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "tbz x4, #2, 105f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x4, #1, 104f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 107f\n" + "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 107f\n" + "105:" // Oddments: Load (5, 3): Bit 2: Unset + "tbz x4, #1, 106f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 107f\n" + "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[0], [x20]\n" + "107:" // Oddments: Load (5, 3): Bit 2: End + "ldr d3, [x3, #0xb8]\n" + "usubl v25.8h, v25.8b, v7.8b\n" + "smlal v10.4s, v25.4h, v2.4h\n" + "ldr x13, [x25, #0x110]\n" + "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v9.4s, v25.8h, v2.8h\n" + "add x13, x13, x10\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v20.4s, v30.8h, v3.8h\n" + "smlal v18.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 109f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "tbz x4, #1, 108f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[6], [x13]\n" + "b 111f\n" + "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[4], [x13]\n" + "b 111f\n" + "109:" // Oddments: Load (5, 4): Bit 2: Unset + "tbz x4, #1, 110f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[2], [x13]\n" + "b 111f\n" + "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[0], [x13]\n" + "111:" // 
Oddments: Load (5, 4): Bit 2: End + "ldr d4, [x3, #0xc0]\n" + "usubl v24.8h, v24.8b, v7.8b\n" + "smlal v10.4s, v24.4h, v3.4h\n" + "ldr x21, [x25, #0x118]\n" + "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "add x21, x21, x10\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v20.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 113f\n" + "ld1 { v27.s }[0], [x21], #0x4\n" + "tbz x4, #1, 112f\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[6], [x21]\n" + "b 115f\n" + "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[4], [x21]\n" + "b 115f\n" + "113:" // Oddments: Load (5, 5): Bit 2: Unset + "tbz x4, #1, 114f\n" + "ld1 { v27.h }[0], [x21], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[2], [x21]\n" + "b 115f\n" + "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[0], [x21]\n" + "115:" // Oddments: Load (5, 5): Bit 2: End + "usubl v27.8h, v27.8b, v7.8b\n" + "smlal v10.4s, v27.4h, v4.4h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "tbz x4, #2, 117f\n" + "ld1 { v6.4s }, [x2], #0x10\n" + "ld1 { v21.4s }, [x5], #0x10\n" + "tbz x4, #1, 116f\n" + "ld1 { v17.d }[0], [x2], #0x8\n" + "ld1 { v14.d }[0], [x5], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v17.s }[2], [x2]\n" + "ld1 { v14.s }[2], [x5]\n" + "b 119f\n" + "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v17.s }[0], [x2]\n" + "ld1 { v14.s }[0], [x5]\n" + "b 119f\n" + "117:" // Oddments: Load requant params: Bit 2: Unset + "tbz x4, #1, 118f\n" + "ld1 { v6.d }[0], [x2], #0x8\n" + "ld1 { v21.d }[0], [x5], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v6.s }[2], [x2]\n" + "ld1 { v21.s }[2], [x5]\n" + "b 119f\n" + "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v6.s }[0], [x2]\n" + "ld1 { v21.s }[0], [x5]\n" + "119:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "add x17, x17, x1\n" + "sqrdmulh v20.4s, v20.4s, v17.4s\n" + "add x16, x16, x1\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "add x6, x6, x1\n" + "sqrdmulh v5.4s, v5.4s, v17.4s\n" + "add x8, x8, x1\n" + "sqrdmulh v11.4s, v11.4s, v6.4s\n" + "and v1.16b, v15.16b, v21.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v29.16b, v20.16b, v14.16b\n" + "and v3.16b, v18.16b, v21.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v2.16b, v5.16b, v14.16b\n" + "and v0.16b, v11.16b, v21.16b\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v1.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "sqrdmulh v9.4s, v9.4s, v17.4s\n" + "sqadd v20.4s, v20.4s, v29.4s\n" + "sqadd v18.4s, v18.4s, v3.4s\n" + "srshl v15.4s, v15.4s, v21.4s\n" + "sqadd v5.4s, v5.4s, v2.4s\n" + "srshl v20.4s, v20.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v21.4s\n" + "add v15.4s, v15.4s, v19.4s\n" + "srshl v5.4s, v5.4s, v14.4s\n" + "add v20.4s, v20.4s, v19.4s\n" + "smin v15.4s, v15.4s, v12.4s\n" + "add v18.4s, v18.4s, v19.4s\n" + "smin v20.4s, v20.4s, v12.4s\n" + "smax v15.4s, v15.4s, v16.4s\n" + "smin v18.4s, v18.4s, v12.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "add v5.4s, v5.4s, v19.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "uzp1 v15.16b, v15.16b, v20.16b\n" + "smin v5.4s, v5.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "sqadd v11.4s, v11.4s, v0.4s\n" + "smax v5.4s, 
v5.4s, v16.4s\n" + "and v27.16b, v8.16b, v14.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "uzp1 v18.16b, v18.16b, v5.16b\n" + "srshl v11.4s, v11.4s, v21.4s\n" + "and v30.16b, v10.16b, v21.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "add v11.4s, v11.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v27.4s\n" + "and v6.16b, v9.16b, v14.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "smin v11.4s, v11.4s, v12.4s\n" + "srshl v8.4s, v8.4s, v14.4s\n" + "sqadd v10.4s, v10.4s, v30.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "add v8.4s, v8.4s, v19.4s\n" + "srshl v10.4s, v10.4s, v21.4s\n" + "sqadd v9.4s, v9.4s, v6.4s\n" + "smin v8.4s, v8.4s, v12.4s\n" + "add v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v14.4s\n" + "smax v8.4s, v8.4s, v16.4s\n" + "smin v10.4s, v10.4s, v12.4s\n" + "uzp1 v11.16b, v11.16b, v8.16b\n" + "add v9.4s, v9.4s, v19.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smin v9.4s, v9.4s, v12.4s\n" + "smax v9.4s, v9.4s, v16.4s\n" + "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v10.16b, v10.16b, v10.16b\n" + "tbz x4, #2, 121f\n" + "st1 { v15.s }[0], [x17], #0x4\n" + "st1 { v18.s }[0], [x16], #0x4\n" + "st1 { v11.s }[0], [x6], #0x4\n" + "st1 { v10.s }[0], [x8], #0x4\n" + "tbz x4, #1, 120f\n" + "st1 { v15.h }[2], [x17], #0x2\n" + "st1 { v18.h }[2], [x16], #0x2\n" + "st1 { v11.h }[2], [x6], #0x2\n" + "st1 { v10.h }[2], [x8], #0x2\n" + "tbz x4, #0, 123f\n" + "st1 { v15.b }[6], [x17], #0x1\n" + "st1 { v18.b }[6], [x16], #0x1\n" + "st1 { v11.b }[6], [x6], #0x1\n" + "st1 { v10.b }[6], [x8], #0x1\n" + "b 123f\n" + "120:" // Oddments: Bit 2: Bit 1: Unset + "tbz x4, #0, 123f\n" + "st1 { v15.b }[4], [x17], #0x1\n" + "st1 { v18.b }[4], [x16], #0x1\n" + "st1 { v11.b }[4], [x6], #0x1\n" + "st1 { v10.b }[4], [x8], #0x1\n" + "b 123f\n" + "121:" // Oddments: Bit 2: Unset + "tbz x4, #1, 122f\n" + "st1 { v15.h }[0], [x17], #0x2\n" + "st1 { v18.h }[0], [x16], #0x2\n" + "st1 { v11.h }[0], [x6], #0x2\n" + "st1 { v10.h }[0], [x8], #0x2\n" + "tbz x4, #0, 123f\n" + "st1 { v15.b }[2], [x17], #0x1\n" + "st1 { v18.b }[2], [x16], #0x1\n" + "st1 { v11.b }[2], [x6], #0x1\n" + "st1 { v10.b }[2], [x8], #0x1\n" + "b 123f\n" + "122:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 123f\n" + "st1 { v15.b }[0], [x17], #0x1\n" + "st1 { v18.b }[0], [x16], #0x1\n" + "st1 { v11.b }[0], [x6], #0x1\n" + "st1 { v10.b }[0], [x8], #0x1\n" + "123:" // Oddments: Bit 2: End + + "124:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp new file mode 100644 index 0000000000..f5459c2ac1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); + +struct a64_u8q_nhwc_generic_output9_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int n_output_points = 9; + + kern_type kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl; + + a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..42d9b2f408 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "arm_gemm.hpp" +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const void *params, + const arm_gemm::Requantize32& qp, + const unsigned int n_points, + const unsigned int n_channels +) +{ + __asm__ __volatile__( + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v12.4s }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n" + "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n" + "ld1r { v11.4s }, [x20]\n" + "ld1r { v10.16b }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v9.16b }, [x20]\n" + "ld1r { v8.4s }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n" + "ld1r { v7.4s }, [x20]\n" + "ld1r { v6.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n" + "mov x11, #0x0\n" + "ld1r { v5.4s }, [x19]\n" + "lsr x10, %x[n_channels], #0x2\n" + "cbz x10, 6f\n" + "1:" // Channel loop + "movi v27.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x11, #0x2\n" + "ldr q27, [%x[bias], x19]\n" + "2:" // Channel loop: Load bias: Done + "mov v26.16b, v27.16b\n" + "ldr s16, [%x[params]], #0x4\n" + "mov x20, %x[inptrs]\n" + "mov v25.16b, v27.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, %x[n_points], #0x1\n" + "mov v24.16b, v27.16b\n" + "ldr s4, [x9, x11]\n" + "mov v23.16b, v27.16b\n" + "mov v22.16b, v27.16b\n" + "ldr s3, [x28, x11]\n" + "mov v21.16b, v27.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v20.16b, v27.16b\n" + "ldr s2, [x27, x11]\n" + "mov v19.16b, v27.16b\n" + "usubl v16.8h, v16.8b, v9.8b\n" + "ldr s1, [x26, x11]\n" + "usubl v4.8h, v4.8b, v10.8b\n" + "ldp x25, x24, [x20], #0x10\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "ldr s0, [x25, x11]\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "ldr s31, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "ldr s30, [x23, x11]\n" + "ldr s29, [x22, x11]\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "ldr x21, [x20], #0x8\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "ldr s28, [x21, x11]\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "ble 4f\n" + "3:" // Channel loop: Planar loop +
"smlal v27.4s, v4.4h, v16.4h\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, x19, #0x1\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "ldr s4, [x9, x11]\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "ldr s3, [x28, x11]\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "ldp x27, x26, [x20], #0x10\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "ldr s2, [x27, x11]\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "ldr s16, [%x[params]], #0x4\n" + "usubl v4.8h, v4.8b, v10.8b\n" + "ldr s1, [x26, x11]\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "ldp x25, x24, [x20], #0x10\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "ldr s0, [x25, x11]\n" + "usubl v16.8h, v16.8b, v9.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "ldr s31, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "ldr s30, [x23, x11]\n" + "ldr s29, [x22, x11]\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "ldr x21, [x20], #0x8\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "ldr s28, [x21, x11]\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "bgt 3b\n" + "4:" // Channel loop: Planar tail + "smlal v27.4s, v4.4h, v16.4h\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "cbz %x[rq_mul_ptr], 5f\n" + "lsl x19, x11, #0x2\n" + "ldr q6, [%x[rq_mul_ptr], x19]\n" + "ldr q5, [%x[rq_right_shift_ptr], x19]\n" + "cbz %x[rq_left_shift_ptr], 5f\n" + "ldr q7, [%x[rq_left_shift_ptr], x19]\n" + "5:" // Channel loop: Load quantisation parameters: Done + "sshl v27.4s, v27.4s, v7.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "sshl v26.4s, v26.4s, v7.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "sshl v25.4s, v25.4s, v7.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "sqrdmulh v27.4s, v27.4s, v6.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "sqrdmulh v25.4s, v25.4s, v6.4s\n" + "sshl v24.4s, v24.4s, v7.4s\n" + "and v16.16b, v27.16b, v5.16b\n" + "and v18.16b, v26.16b, v5.16b\n" + "and v17.16b, v25.16b, v5.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sqadd v26.4s, v26.4s, v18.4s\n" + "sqadd v25.4s, v25.4s, v17.4s\n" + "sqrdmulh v24.4s, v24.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v5.4s\n" + "srshl v26.4s, v26.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v5.4s\n" + "and v16.16b, v24.16b, v5.16b\n" + "add v27.4s, v27.4s, v8.4s\n" + "add v26.4s, v26.4s, v8.4s\n" + "add v25.4s, v25.4s, v8.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v27.4s, v27.4s, v12.4s\n" + "smax v26.4s, v26.4s, v12.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "smin v27.4s, v27.4s, v11.4s\n" + "smin v26.4s, v26.4s, v11.4s\n" + "smax v25.4s, v25.4s, v12.4s\n" + "srshl v24.4s, v24.4s, v5.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "smin v25.4s, v25.4s, v11.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x27, x11]\n" + "add v24.4s, v24.4s, v8.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x26, x11]\n" + "smax v24.4s, v24.4s, v12.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x25, x11]\n" + "sshl v23.4s, v23.4s, v7.4s\n" + "sshl v22.4s, v22.4s, v7.4s\n" + "smin v24.4s, v24.4s, v11.4s\n" + "sqrdmulh v23.4s, v23.4s, v6.4s\n" 
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sshl v21.4s, v21.4s, v7.4s\n" + "and v17.16b, v23.16b, v5.16b\n" + "and v16.16b, v22.16b, v5.16b\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x24, x11]\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v21.16b, v5.16b\n" + "sshl v20.4s, v20.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v7.4s\n" + "srshl v23.4s, v23.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v5.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v6.4s\n" + "add v23.4s, v23.4s, v8.4s\n" + "add v22.4s, v22.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "and v17.16b, v20.16b, v5.16b\n" + "sqrdmulh v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v5.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v19.16b, v5.16b\n" + "smin v23.4s, v23.4s, v11.4s\n" + "add v21.4s, v21.4s, v8.4s\n" + "sqadd v20.4s, v20.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v22.4s, v22.4s, v12.4s\n" + "smax v21.4s, v21.4s, v12.4s\n" + "srshl v20.4s, v20.4s, v5.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "smin v21.4s, v21.4s, v11.4s\n" + "add v20.4s, v20.4s, v8.4s\n" + "srshl v19.4s, v19.4s, v5.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x23, x11]\n" + "add v19.4s, v19.4s, v8.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x22, x11]\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x21, x11]\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x20, x11]\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x19, x11]\n" + "add x11, x11, #0x4\n" + "cmp x11, x10, LSL #2\n" + "blt 1b\n" + "6:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 24f\n" + "movi v27.4s, #0x0\n" + "cbz %x[bias], 9f\n" + "add x19, %x[bias], x11, LSL #2\n" + "tbz %x[n_channels], #1, 7f\n" + "ld1 { v27.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v27.s }[2], [x19], #0x4\n" + "b 8f\n" + "7:" // Oddments: Load bias: Bit 1: Unset + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "8:" // Oddments: Load bias: Bit 1: End + + "9:" // Oddments: Load bias: Done + "mov v26.16b, v27.16b\n" + "ldr s16, [%x[params]], #0x4\n" + "mov x20, %x[inptrs]\n" + "mov v25.16b, v27.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "mov v24.16b, v27.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v23.16b, v27.16b\n" + "ldp x25, x24, [x20], #0x10\n" + "mov v22.16b, v27.16b\n" + "add x28, x28, x11\n" + "mov v21.16b, v27.16b\n" + "ldp x23, x22, [x20], #0x10\n" + "mov v20.16b, v27.16b\n" + "add x27, x27, x11\n" + "mov v19.16b, v27.16b\n" + "ldr x21, [x20], #0x8\n" + "usubl v16.8h, v16.8b, v9.8b\n" + "add x26, x26, x11\n" + "add x25, x25, x11\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #1, 10f\n" + "ldr h4, [x9], #0x2\n" + "ldr h3, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, [x25], #0x2\n" + "ldr h31, [x24], #0x2\n" + "ldr h30, [x23], #0x2\n" + "ldr h29, [x22], #0x2\n" + "ldr h28, [x21], #0x2\n" + "tbz 
%x[n_channels], #0, 11f\n" + "ld1 { v4.b }[2], [x9], #0x1\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v2.b }[2], [x27], #0x1\n" + "ld1 { v1.b }[2], [x26], #0x1\n" + "ld1 { v0.b }[2], [x25], #0x1\n" + "ld1 { v31.b }[2], [x24], #0x1\n" + "ld1 { v30.b }[2], [x23], #0x1\n" + "ld1 { v29.b }[2], [x22], #0x1\n" + "ld1 { v28.b }[2], [x21], #0x1\n" + "b 11f\n" + "10:" // Oddments: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 11f\n" + "ldr b4, [x9], #0x1\n" + "ldr b3, [x28], #0x1\n" + "ldr b2, [x27], #0x1\n" + "ldr b1, [x26], #0x1\n" + "ldr b0, [x25], #0x1\n" + "ldr b31, [x24], #0x1\n" + "ldr b30, [x23], #0x1\n" + "ldr b29, [x22], #0x1\n" + "ldr b28, [x21], #0x1\n" + "11:" // Oddments: Load: Bit 1: End + "usubl v4.8h, v4.8b, v10.8b\n" + "subs x19, %x[n_points], #0x1\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "ble 15f\n" + "12:" // Oddments: Planar loop + "smlal v27.4s, v4.4h, v16.4h\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "ldp x27, x26, [x20], #0x10\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "ldp x25, x24, [x20], #0x10\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "add x28, x28, x11\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "ldp x23, x22, [x20], #0x10\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "add x27, x27, x11\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "ldr x21, [x20], #0x8\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "add x26, x26, x11\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "ldr s16, [%x[params]], #0x4\n" + "add x25, x25, x11\n" + "usubl v16.8h, v16.8b, v9.8b\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr h4, [x9], #0x2\n" + "ldr h3, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, [x25], #0x2\n" + "ldr h31, [x24], #0x2\n" + "ldr h30, [x23], #0x2\n" + "ldr h29, [x22], #0x2\n" + "ldr h28, [x21], #0x2\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v4.b }[2], [x9], #0x1\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v2.b }[2], [x27], #0x1\n" + "ld1 { v1.b }[2], [x26], #0x1\n" + "ld1 { v0.b }[2], [x25], #0x1\n" + "ld1 { v31.b }[2], [x24], #0x1\n" + "ld1 { v30.b }[2], [x23], #0x1\n" + "ld1 { v29.b }[2], [x22], #0x1\n" + "ld1 { v28.b }[2], [x21], #0x1\n" + "b 14f\n" + "13:" // Oddments: Planar loop: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 14f\n" + "ldr b4, [x9], #0x1\n" + "ldr b3, [x28], #0x1\n" + "ldr b2, [x27], #0x1\n" + "ldr b1, [x26], #0x1\n" + "ldr b0, [x25], #0x1\n" + "ldr b31, [x24], #0x1\n" + "ldr b30, [x23], #0x1\n" + "ldr b29, [x22], #0x1\n" + "ldr b28, [x21], #0x1\n" + "14:" // Oddments: Planar loop: Load: Bit 1: End + "usubl v4.8h, v4.8b, v10.8b\n" + "subs x19, x19, #0x1\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "bgt 12b\n" + "15:" // Oddments: Planar tail + "smlal v27.4s, v4.4h, v16.4h\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "cbz 
%x[rq_mul_ptr], 21f\n" + "add x21, %x[rq_mul_ptr], x11, LSL #2\n" + "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n" + "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v6.d }[0], [x21], #0x8\n" + "ld1 { v5.d }[0], [x20], #0x8\n" + "cbz %x[rq_left_shift_ptr], 16f\n" + "ld1 { v7.d }[0], [x19], #0x8\n" + "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v6.s }[2], [x21], #0x4\n" + "ld1 { v5.s }[2], [x20], #0x4\n" + "cbz %x[rq_left_shift_ptr], 17f\n" + "ld1 { v7.s }[2], [x19], #0x4\n" + "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done + "b 20f\n" + "18:" // Oddments: Load quantisation parameters: Bit 1: Unset + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v6.s }[0], [x21], #0x4\n" + "ld1 { v5.s }[0], [x20], #0x4\n" + "cbz %x[rq_left_shift_ptr], 19f\n" + "ld1 { v7.s }[0], [x19], #0x4\n" + "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done + + "20:" // Oddments: Load quantisation parameters: Bit 1: End + + "21:" // Oddments: Load quantisation parameters: Done + "sshl v27.4s, v27.4s, v7.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "add x27, x27, x11\n" + "sqrdmulh v27.4s, v27.4s, v6.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "sshl v26.4s, v26.4s, v7.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "add x26, x26, x11\n" + "sshl v25.4s, v25.4s, v7.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "sshl v24.4s, v24.4s, v7.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x25, x25, x11\n" + "and v16.16b, v27.16b, v5.16b\n" + "add x24, x24, x11\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "add x23, x23, x11\n" + "sqrdmulh v25.4s, v25.4s, v6.4s\n" + "add x22, x22, x11\n" + "sqrdmulh v24.4s, v24.4s, v6.4s\n" + "add x21, x21, x11\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add x20, x20, x11\n" + "and v18.16b, v26.16b, v5.16b\n" + "add x19, x19, x11\n" + "and v17.16b, v25.16b, v5.16b\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v24.16b, v5.16b\n" + "srshl v27.4s, v27.4s, v5.4s\n" + "sqadd v26.4s, v26.4s, v18.4s\n" + "sqadd v25.4s, v25.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v27.4s, v27.4s, v8.4s\n" + "srshl v26.4s, v26.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v5.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "smax v27.4s, v27.4s, v12.4s\n" + "add v26.4s, v26.4s, v8.4s\n" + "add v25.4s, v25.4s, v8.4s\n" + "srshl v24.4s, v24.4s, v5.4s\n" + "smin v27.4s, v27.4s, v11.4s\n" + "smax v26.4s, v26.4s, v12.4s\n" + "smax v25.4s, v25.4s, v12.4s\n" + "add v24.4s, v24.4s, v8.4s\n" + "smin v26.4s, v26.4s, v11.4s\n" + "smin v25.4s, v25.4s, v11.4s\n" + "smax v24.4s, v24.4s, v12.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v24.4s, v24.4s, v11.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sshl v23.4s, v23.4s, v7.4s\n" + "sshl v22.4s, v22.4s, v7.4s\n" + "sqrdmulh v23.4s, v23.4s, v6.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sshl v21.4s, v21.4s, v7.4s\n" + "sshl v20.4s, v20.4s, v7.4s\n" + "and v17.16b, v23.16b, v5.16b\n" + "and v16.16b, v22.16b, v5.16b\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v6.4s\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sqadd 
v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v21.16b, v5.16b\n" + "and v17.16b, v20.16b, v5.16b\n" + "srshl v23.4s, v23.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v5.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "add v23.4s, v23.4s, v8.4s\n" + "add v22.4s, v22.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "sqadd v20.4s, v20.4s, v17.4s\n" + "smax v23.4s, v23.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v5.4s\n" + "srshl v20.4s, v20.4s, v5.4s\n" + "smin v23.4s, v23.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "add v21.4s, v21.4s, v8.4s\n" + "add v20.4s, v20.4s, v8.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smax v21.4s, v21.4s, v12.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v21.4s, v21.4s, v11.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "sshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "sqrdmulh v19.4s, v19.4s, v6.4s\n" + "and v16.16b, v19.16b, v5.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "srshl v19.4s, v19.4s, v5.4s\n" + "add v19.4s, v19.4s, v8.4s\n" + "smax v19.4s, v19.4s, v12.4s\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "tbz %x[n_channels], #1, 22f\n" + "st1 { v27.h }[0], [x27], #0x2\n" + "st1 { v26.h }[0], [x26], #0x2\n" + "st1 { v25.h }[0], [x25], #0x2\n" + "st1 { v24.h }[0], [x24], #0x2\n" + "st1 { v23.h }[0], [x23], #0x2\n" + "st1 { v22.h }[0], [x22], #0x2\n" + "st1 { v21.h }[0], [x21], #0x2\n" + "st1 { v20.h }[0], [x20], #0x2\n" + "st1 { v19.h }[0], [x19], #0x2\n" + "tbz %x[n_channels], #0, 23f\n" + "st1 { v27.b }[2], [x27], #0x1\n" + "st1 { v26.b }[2], [x26], #0x1\n" + "st1 { v25.b }[2], [x25], #0x1\n" + "st1 { v24.b }[2], [x24], #0x1\n" + "st1 { v23.b }[2], [x23], #0x1\n" + "st1 { v22.b }[2], [x22], #0x1\n" + "st1 { v21.b }[2], [x21], #0x1\n" + "st1 { v20.b }[2], [x20], #0x1\n" + "st1 { v19.b }[2], [x19], #0x1\n" + "b 23f\n" + "22:" // Oddments: Store: Bit 1: Unset + "tbz %x[n_channels], #0, 23f\n" + "st1 { v27.b }[0], [x27], #0x1\n" + "st1 { v26.b }[0], [x26], #0x1\n" + "st1 { v25.b }[0], [x25], #0x1\n" + "st1 { v24.b }[0], [x24], #0x1\n" + "st1 { v23.b }[0], [x23], #0x1\n" + "st1 { v22.b }[0], [x22], #0x1\n" + "st1 { v21.b }[0], [x21], #0x1\n" + "st1 { v20.b }[0], [x20], #0x1\n" + "st1 { v19.b }[0], [x19], #0x1\n" + "23:" // Oddments: Store: Bit 1: End + + "24:" // End + + : [params] "+&r" (params) + : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), 
[rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp new file mode 100644 index 0000000000..e8ac603928 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+{
+  typedef uint32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 9;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+
+  a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..2106cf7086
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + + +#include "arm_gemm.hpp" +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const void *params, + unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "movi v5.16b, #0x1\n" + "ldr x22, [%x[inptrs], #0x0]\n" + "add SP, SP, #-0x80\n" + "ushr v5.4s, v5.4s, #0x8\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "add x21, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "movi v26.4s, #0x0\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "mov x11, #0x0\n" + "movi v1.4s, #0x0\n" + "ld1 { v15.16b }, [x22]\n" + "mov x10, #0x0\n" + "movi v22.4s, #0x0\n" + "ld1 { v29.16b }, [x20]\n" + "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "movi v25.4s, #0x0\n" + "ld1 { v0.16b }, [x19]\n" + "add x28, %x[qp], %[offsetof_Requantize32_minval]\n" + "movi v13.4s, #0x0\n" + "ldr x20, [%x[inptrs], #0x18]\n" + "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n" + "mov v20.16b, v15.16b\n" + "ldr x19, [%x[inptrs], #0x20]\n" + "cmp %x[n_channels], #0x4\n" + "ext v20.16b, v20.16b, v20.16b, #0x2\n" + "ld1r { v4.4s }, [x21]\n" + "mov v17.16b, v15.16b\n" + "ld1 { v2.16b }, [x20]\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "ld1 { v7.16b }, [x19]\n" + "mov v23.16b, v15.16b\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "ext v23.16b, v23.16b, v23.16b, #0x6\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "mov v18.16b, v29.16b\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "zip1 v15.4s, v15.4s, v17.4s\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "ext v18.16b, v18.16b, v18.16b, #0x2\n" + "ld1r { v14.4s }, [x9]\n" + "zip1 v20.4s, v20.4s, v23.4s\n" + "ld1r { v27.4s }, [x28]\n" + "zip1 v15.4s, v15.4s, v20.4s\n" + "ld1r { v23.4s }, [x27]\n" + "mov v17.16b, v29.16b\n" + "ldr q6, [%x[params], #0x0]\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "ldr q8, [%x[params], #0x10]\n" + "mov v11.16b, v29.16b\n" + "ldr q9, [%x[params], #0x20]\n" + "ext v11.16b, v11.16b, v11.16b, #0x6\n" + "ldr q10, [%x[params], #0x30]\n" + "add %x[params], %x[params], #0x40\n" + "zip1 v29.4s, v29.4s, v17.4s\n" + "mov v12.16b, v0.16b\n" + "ext v12.16b, v12.16b, v12.16b, #0x2\n" + "zip1 v18.4s, v18.4s, v11.4s\n" + "zip1 v29.4s, v29.4s, v18.4s\n" + "mov v17.16b, v0.16b\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "mov v11.16b, v0.16b\n" + "ext v11.16b, v11.16b, v11.16b, #0x6\n" + "mov v18.16b, v2.16b\n" + "zip1 v0.4s, v0.4s, v17.4s\n" + "ext v18.16b, v18.16b, v18.16b, #0x2\n" + "zip1 v12.4s, v12.4s, v11.4s\n" + "zip1 v0.4s, v0.4s, v12.4s\n" + "mov v17.16b, v2.16b\n" + "ext v17.16b, v17.16b, v17.16b, #0x4\n" + "mov v19.16b, v2.16b\n" + "ext v19.16b, v19.16b, v19.16b, #0x6\n" + "mov v28.16b, v7.16b\n" + "zip1 v2.4s, v2.4s, v17.4s\n" + "ext v28.16b, v28.16b, v28.16b, #0x2\n" + "zip1 v18.4s, v18.4s, v19.4s\n" + "zip1 v2.4s, v2.4s, v18.4s\n" + "mov v18.16b, v7.16b\n" + "ext v18.16b, v18.16b, v18.16b, #0x4\n" + "mov v21.16b, v7.16b\n" + "ext v21.16b, v21.16b, v21.16b, #0x6\n" + "movi v30.4s, #0x0\n" + "zip1 v7.4s, v7.4s, v18.4s\n" + "movi v3.4s, #0x0\n" + "zip1 v28.4s, v28.4s, v21.4s\n" + "zip1 v7.4s, v7.4s, v28.4s\n" + "movi v12.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v31.4s, #0x0\n" + ".inst 0x6f8fe0ba // udot v26.4s, v5.16b, v15.4b[0]\n" + ".inst 0x6fafe0a1 // 
udot v1.4s, v5.16b, v15.4b[1]\n" + ".inst 0x6f8fe8b6 // udot v22.4s, v5.16b, v15.4b[2]\n" + ".inst 0x6fafe8b9 // udot v25.4s, v5.16b, v15.4b[3]\n" + ".inst 0x6f9de0ad // udot v13.4s, v5.16b, v29.4b[0]\n" + ".inst 0x6fbde0be // udot v30.4s, v5.16b, v29.4b[1]\n" + ".inst 0x6f9de8a3 // udot v3.4s, v5.16b, v29.4b[2]\n" + ".inst 0x6fbde8ac // udot v12.4s, v5.16b, v29.4b[3]\n" + ".inst 0x6f80e0ab // udot v11.4s, v5.16b, v0.4b[0]\n" + ".inst 0x6fa0e0b3 // udot v19.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6f80e8b5 // udot v21.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6f82e0b0 // udot v16.4s, v5.16b, v2.4b[0]\n" + ".inst 0x6fa2e0bc // udot v28.4s, v5.16b, v2.4b[1]\n" + ".inst 0x6f82e8b2 // udot v18.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6fa2e8b4 // udot v20.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6f87e0b8 // udot v24.4s, v5.16b, v7.4b[0]\n" + ".inst 0x6fa7e0bf // udot v31.4s, v5.16b, v7.4b[1]\n" + "mov v26.16b, v26.16b\n" + "mov v1.16b, v1.16b\n" + "mov v22.16b, v22.16b\n" + "mov v25.16b, v25.16b\n" + "add v26.4s, v26.4s, v13.4s\n" + "movi v13.4s, #0x0\n" + ".inst 0x6f87e8ad // udot v13.4s, v5.16b, v7.4b[2]\n" + "add v1.4s, v1.4s, v30.4s\n" + "movi v30.4s, #0x0\n" + ".inst 0x6fa7e8be // udot v30.4s, v5.16b, v7.4b[3]\n" + "add v22.4s, v22.4s, v3.4s\n" + "add v25.4s, v25.4s, v12.4s\n" + "add v26.4s, v26.4s, v11.4s\n" + "add v1.4s, v1.4s, v19.4s\n" + "add v22.4s, v22.4s, v21.4s\n" + "add v25.4s, v25.4s, v17.4s\n" + "mov v11.16b, v11.16b\n" + "mov v3.16b, v19.16b\n" + "mov v19.16b, v21.16b\n" + "mov v21.16b, v17.16b\n" + "add v11.4s, v11.4s, v16.4s\n" + "add v3.4s, v3.4s, v28.4s\n" + "add v19.4s, v19.4s, v18.4s\n" + "add v21.4s, v21.4s, v20.4s\n" + "add v11.4s, v11.4s, v24.4s\n" + "add v3.4s, v3.4s, v31.4s\n" + "add v19.4s, v19.4s, v13.4s\n" + "add v21.4s, v21.4s, v30.4s\n" + "neg v4.4s, v4.4s\n" + "mul v26.4s, v26.4s, v4.4s\n" + "str q26, [SP, #0x0]\n" + "mul v1.4s, v1.4s, v4.4s\n" + "mul v22.4s, v22.4s, v4.4s\n" + "str q1, [SP, #0x10]\n" + "mul v25.4s, v25.4s, v4.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "str q22, [SP, #0x20]\n" + "mul v3.4s, v3.4s, v4.4s\n" + "str q25, [SP, #0x30]\n" + "mul v19.4s, v19.4s, v4.4s\n" + "mul v21.4s, v21.4s, v4.4s\n" + "str q11, [SP, #0x40]\n" + "add v26.4s, v26.4s, v6.4s\n" + "str q3, [SP, #0x50]\n" + "add v1.4s, v1.4s, v6.4s\n" + "str q19, [SP, #0x60]\n" + "add v22.4s, v22.4s, v6.4s\n" + "add v25.4s, v25.4s, v6.4s\n" + "str q21, [SP, #0x70]\n" + "add v11.4s, v11.4s, v6.4s\n" + "add v3.4s, v3.4s, v6.4s\n" + "add v19.4s, v19.4s, v6.4s\n" + "add v21.4s, v21.4s, v6.4s\n" + "ble 2f\n" + "1:" // Loop + ".inst 0x6f8fe11a // udot v26.4s, v8.16b, v15.4b[0]\n" + "ldr q20, [%x[params], #0x0]\n" + "add x11, x11, #0x10\n" + ".inst 0x6fafe101 // udot v1.4s, v8.16b, v15.4b[1]\n" + "ldr q4, [%x[params], #0x10]\n" + "sub %x[n_channels], %x[n_channels], #0x4\n" + ".inst 0x6f8fe916 // udot v22.4s, v8.16b, v15.4b[2]\n" + "ldr q6, [%x[params], #0x20]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x6fafe919 // udot v25.4s, v8.16b, v15.4b[3]\n" + ".inst 0x6f80e10b // udot v11.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6fa0e103 // udot v3.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6fa0e915 // udot v21.4s, v8.16b, v0.4b[3]\n" + "ldr q8, [%x[params], #0x30]\n" + ".inst 0x6f9de13a // udot v26.4s, v9.16b, v29.4b[0]\n" + ".inst 0x6fbde121 // udot v1.4s, v9.16b, v29.4b[1]\n" + ".inst 0x6f9de936 // udot v22.4s, v9.16b, v29.4b[2]\n" + ".inst 0x6fbde939 // udot v25.4s, v9.16b, v29.4b[3]\n" + ".inst 0x6f82e12b // udot v11.4s, v9.16b, 
v2.4b[0]\n" + ".inst 0x6fa2e123 // udot v3.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6f82e933 // udot v19.4s, v9.16b, v2.4b[2]\n" + ".inst 0x6fa2e935 // udot v21.4s, v9.16b, v2.4b[3]\n" + "ldr q9, [%x[params], #0x40]\n" + ".inst 0x6f80e15a // udot v26.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6fa0e141 // udot v1.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6f80e956 // udot v22.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6fa0e959 // udot v25.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6f87e14b // udot v11.4s, v10.16b, v7.4b[0]\n" + ".inst 0x6fa7e143 // udot v3.4s, v10.16b, v7.4b[1]\n" + ".inst 0x6f87e953 // udot v19.4s, v10.16b, v7.4b[2]\n" + ".inst 0x6fa7e955 // udot v21.4s, v10.16b, v7.4b[3]\n" + "ldr q10, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x60\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v1.4s, v1.4s, v20.4s\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "sqrdmulh v11.4s, v11.4s, v20.4s\n" + "and v30.16b, v26.16b, v4.16b\n" + "and v17.16b, v1.16b, v4.16b\n" + "and v16.16b, v22.16b, v4.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v30.4s\n" + "sqadd v1.4s, v1.4s, v17.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v25.16b, v4.16b\n" + "srshl v26.4s, v26.4s, v4.4s\n" + "srshl v1.4s, v1.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v4.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v26.4s, v26.4s, v14.4s\n" + "add v1.4s, v1.4s, v14.4s\n" + "add v22.4s, v22.4s, v14.4s\n" + "smin v26.4s, v26.4s, v23.4s\n" + "smin v1.4s, v1.4s, v23.4s\n" + "smin v22.4s, v22.4s, v23.4s\n" + "smax v26.4s, v26.4s, v27.4s\n" + "smax v1.4s, v1.4s, v27.4s\n" + "smax v22.4s, v22.4s, v27.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x26, x10]\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "ldr q26, [SP, #0x0]\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "str s1, [x25, x10]\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "ldr q1, [SP, #0x10]\n" + "and v16.16b, v11.16b, v4.16b\n" + "str s22, [x24, x10]\n" + "sqrdmulh v3.4s, v3.4s, v20.4s\n" + "ldr q22, [SP, #0x20]\n" + "srshl v25.4s, v25.4s, v4.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v20.4s\n" + "and v17.16b, v3.16b, v4.16b\n" + "add v25.4s, v25.4s, v14.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v23.4s\n" + "and v16.16b, v19.16b, v4.16b\n" + "srshl v11.4s, v11.4s, v4.4s\n" + "smax v25.4s, v25.4s, v27.4s\n" + "sqadd v3.4s, v3.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v11.4s, v11.4s, v14.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x23, x10]\n" + "smin v11.4s, v11.4s, v23.4s\n" + "srshl v3.4s, v3.4s, v4.4s\n" + "ldr q25, [SP, #0x30]\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v11.4s, v11.4s, v27.4s\n" + "add v3.4s, v3.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v4.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "smin v3.4s, v3.4s, v23.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str s11, [x22, x10]\n" + "smax v3.4s, v3.4s, v27.4s\n" + "add v19.4s, v19.4s, v14.4s\n" + "ldr q11, [SP, #0x40]\n" + "and v16.16b, v21.16b, v4.16b\n" + "add v26.4s, v26.4s, v6.4s\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "smin v19.4s, v19.4s, v23.4s\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "str s3, [x21, x10]\n" + "smax v19.4s, v19.4s, v27.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr q3, [SP, #0x50]\n" + "add v1.4s, v1.4s, 
v6.4s\n" + "add v22.4s, v22.4s, v6.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x20, x10]\n" + "add v25.4s, v25.4s, v6.4s\n" + "add v11.4s, v11.4s, v6.4s\n" + "ldr q19, [SP, #0x60]\n" + "srshl v21.4s, v21.4s, v4.4s\n" + "add v3.4s, v3.4s, v6.4s\n" + "add v21.4s, v21.4s, v14.4s\n" + "add v19.4s, v19.4s, v6.4s\n" + "smin v21.4s, v21.4s, v23.4s\n" + "smax v21.4s, v21.4s, v27.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x19, x10]\n" + "add x10, x10, #0x4\n" + "ldr q21, [SP, #0x70]\n" + "add v21.4s, v21.4s, v6.4s\n" + "bgt 1b\n" + "2:" // Tail + ".inst 0x6f8fe11a // udot v26.4s, v8.16b, v15.4b[0]\n" + "ldr q20, [%x[params], #0x0]\n" + "add x26, x26, x10\n" + ".inst 0x6fafe101 // udot v1.4s, v8.16b, v15.4b[1]\n" + "ldr q4, [%x[params], #0x10]\n" + "add x25, x25, x10\n" + ".inst 0x6f8fe916 // udot v22.4s, v8.16b, v15.4b[2]\n" + "add x24, x24, x10\n" + ".inst 0x6fafe919 // udot v25.4s, v8.16b, v15.4b[3]\n" + "add x23, x23, x10\n" + ".inst 0x6f80e10b // udot v11.4s, v8.16b, v0.4b[0]\n" + "add x22, x22, x10\n" + ".inst 0x6fa0e103 // udot v3.4s, v8.16b, v0.4b[1]\n" + "add x21, x21, x10\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + "add x20, x20, x10\n" + ".inst 0x6fa0e915 // udot v21.4s, v8.16b, v0.4b[3]\n" + "add x19, x19, x10\n" + ".inst 0x6f9de13a // udot v26.4s, v9.16b, v29.4b[0]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x6fbde121 // udot v1.4s, v9.16b, v29.4b[1]\n" + "add %x[params], %x[params], #0x20\n" + ".inst 0x6f9de936 // udot v22.4s, v9.16b, v29.4b[2]\n" + ".inst 0x6fbde939 // udot v25.4s, v9.16b, v29.4b[3]\n" + ".inst 0x6f82e12b // udot v11.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6fa2e123 // udot v3.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6f82e933 // udot v19.4s, v9.16b, v2.4b[2]\n" + ".inst 0x6fa2e935 // udot v21.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6f80e15a // udot v26.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6fa0e141 // udot v1.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6f80e956 // udot v22.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6fa0e959 // udot v25.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6f87e14b // udot v11.4s, v10.16b, v7.4b[0]\n" + ".inst 0x6fa7e143 // udot v3.4s, v10.16b, v7.4b[1]\n" + ".inst 0x6f87e953 // udot v19.4s, v10.16b, v7.4b[2]\n" + ".inst 0x6fa7e955 // udot v21.4s, v10.16b, v7.4b[3]\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v1.4s, v1.4s, v20.4s\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "and v30.16b, v26.16b, v4.16b\n" + "and v17.16b, v1.16b, v4.16b\n" + "and v16.16b, v22.16b, v4.16b\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v30.4s\n" + "sqadd v1.4s, v1.4s, v17.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v25.16b, v4.16b\n" + "srshl v26.4s, v26.4s, v4.4s\n" + "srshl v1.4s, v1.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v4.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v26.4s, v26.4s, v14.4s\n" + "add v1.4s, v1.4s, v14.4s\n" + "add v22.4s, v22.4s, v14.4s\n" + "smin v26.4s, v26.4s, v23.4s\n" + "smin v1.4s, v1.4s, v23.4s\n" + "smin v22.4s, v22.4s, v23.4s\n" + "smax v26.4s, v26.4s, v27.4s\n" + "smax v1.4s, v1.4s, v27.4s\n" + "smax v22.4s, v22.4s, v27.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v1.16b, v1.16b, v1.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v11.4s, 
v11.4s, v20.4s\n" + "sqrdmulh v3.4s, v3.4s, v20.4s\n" + "srshl v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v20.4s\n" + "and v16.16b, v11.16b, v4.16b\n" + "and v17.16b, v3.16b, v4.16b\n" + "add v25.4s, v25.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v23.4s\n" + "sqadd v11.4s, v11.4s, v16.4s\n" + "sqadd v3.4s, v3.4s, v17.4s\n" + "smax v25.4s, v25.4s, v27.4s\n" + "and v16.16b, v19.16b, v4.16b\n" + "srshl v11.4s, v11.4s, v4.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "srshl v3.4s, v3.4s, v4.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v11.4s, v11.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v3.4s, v3.4s, v14.4s\n" + "smin v11.4s, v11.4s, v23.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smin v3.4s, v3.4s, v23.4s\n" + "smax v11.4s, v11.4s, v27.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v3.4s, v3.4s, v27.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "srshl v19.4s, v19.4s, v4.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "and v16.16b, v21.16b, v4.16b\n" + "uzp1 v3.16b, v3.16b, v3.16b\n" + "add v19.4s, v19.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v19.4s, v19.4s, v23.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "smax v19.4s, v19.4s, v27.4s\n" + "srshl v21.4s, v21.4s, v4.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "add v21.4s, v21.4s, v14.4s\n" + "smin v21.4s, v21.4s, v23.4s\n" + "smax v21.4s, v21.4s, v27.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "blt 3f\n" + "str s26, [x26, #0x0]\n" + "str s1, [x25, #0x0]\n" + "str s22, [x24, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s11, [x22, #0x0]\n" + "str s3, [x21, #0x0]\n" + "str s19, [x20, #0x0]\n" + "str s21, [x19, #0x0]\n" + "b 4f\n" + "3:" // Tail: Oddments + "st1 { v26.b }[0], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[0], [x25], #0x1\n" + "st1 { v22.b }[0], [x24], #0x1\n" + "st1 { v25.b }[0], [x23], #0x1\n" + "st1 { v11.b }[0], [x22], #0x1\n" + "st1 { v3.b }[0], [x21], #0x1\n" + "st1 { v19.b }[0], [x20], #0x1\n" + "st1 { v21.b }[0], [x19], #0x1\n" + "beq 4f\n" + "st1 { v26.b }[1], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[1], [x25], #0x1\n" + "st1 { v22.b }[1], [x24], #0x1\n" + "st1 { v25.b }[1], [x23], #0x1\n" + "st1 { v11.b }[1], [x22], #0x1\n" + "st1 { v3.b }[1], [x21], #0x1\n" + "st1 { v19.b }[1], [x20], #0x1\n" + "st1 { v21.b }[1], [x19], #0x1\n" + "beq 4f\n" + "st1 { v26.b }[2], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[2], [x25], #0x1\n" + "st1 { v22.b }[2], [x24], #0x1\n" + "st1 { v25.b }[2], [x23], #0x1\n" + "st1 { v11.b }[2], [x22], #0x1\n" + "st1 { v3.b }[2], [x21], #0x1\n" + "st1 { v19.b }[2], [x20], #0x1\n" + "st1 { v21.b }[2], [x19], #0x1\n" + "beq 4f\n" + "st1 { v26.b }[3], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v1.b }[3], [x25], #0x1\n" + "st1 { v22.b }[3], [x24], #0x1\n" + "st1 { v25.b }[3], [x23], #0x1\n" + "st1 { v11.b }[3], [x22], #0x1\n" + "st1 { v3.b }[3], [x21], #0x1\n" + "st1 { v19.b }[3], [x20], #0x1\n" + "st1 { v21.b }[3], [x19], #0x1\n" + "4:" // Tail: End + "add SP, SP, #0x80\n" + : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params) + : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" 
(offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..c5e0417c20 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
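
A note on the dot-product formulation used by the kernel above: rather than widening the inputs and subtracting the zero points per element (as the usubl-based kernels earlier in this patch do), the udot kernels keep inputs and weights as raw uint8 and correct for the offsets algebraically. For quantised inputs x and weights w with offsets a_off and b_off:

  sum_i (x_i - a_off) * (w_i - b_off)
    = sum_i x_i*w_i - b_off * sum_i x_i - a_off * sum_i w_i + n * a_off * b_off

The last two terms depend only on the weights and are presumably folded into the packed bias when the parameters are interleaved, so the kernel only needs sum_i x_i per output pixel, computed with udot against a vector of 0x01 bytes before the main loop. A hedged sketch, with hypothetical names:

#include <cstdint>

// Illustrative only: the pre-loop accumulator correction the udot kernels compute.
static int32_t initial_accumulator(const uint8_t *x_window, int n,
                                   int32_t packed_bias, int32_t b_offset)
{
    int32_t sum_x = 0;
    for (int i = 0; i < n; ++i)           // the udot-against-v5 instructions above
        sum_x += x_window[i];
    return packed_bias - b_offset * sum_x; // neg v4.4s, then mul, then add v6.4s
}

The movi #0x1 / ushr #0x8 pair at the top of the function appears to build per-lane byte masks of the form {1,1,1,0} so each udot only sums the three valid bytes of a packed group; the resulting per-output corrections are spilled to the stack (str q..., [SP, #...]) and reloaded as the starting accumulators for every four-channel block.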
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+{
+  typedef uint32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 8;
+  constexpr static unsigned int input_cols = 6;
+  constexpr static unsigned int input_col_quads = 1;
+
+  kern_type kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+
+  a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8bcd682e3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
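
For readers cross-checking the descriptor constants in the two headers above: the input tile follows from the output tile by the usual depthfirst relation input_dim = (output_dim - 1) * stride + kernel_dim. A small illustrative compile-time check:

// Compile-time sanity check of the declared tile geometry (illustrative):
//   input_dim = (output_dim - 1) * stride + kernel_dim
static_assert((2 - 1) * 2 + 3 == 5, "3x3/s2 kernel: 2 output rows read 5 input rows");
static_assert((4 - 1) * 2 + 3 == 9, "3x3/s2 kernel: 4 output cols read 9 input cols");
static_assert((4 - 1) * 1 + 5 == 8, "5x5/s1 kernel: 4 output rows read 8 input rows");
static_assert((2 - 1) * 1 + 5 == 6, "5x5/s1 kernel: 2 output cols read 6 input cols");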
+ */ + + +#include "arm_gemm.hpp" +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const void *params, + unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "movi v15.16b, #0x1\n" + "ldr x21, [%x[inptrs], #0x0]\n" + "add SP, SP, #-0x80\n" + "movi v14.4s, #0x1\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "add x22, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "movi v28.4s, #0x0\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "mov x11, #0x0\n" + "movi v27.4s, #0x0\n" + "ld1 { v13.16b }, [x21]\n" + "mov x10, #0x0\n" + "movi v26.4s, #0x0\n" + "ld1 { v12.16b }, [x20]\n" + "add x9, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "movi v25.4s, #0x0\n" + "ld1 { v7.16b }, [x19]\n" + "add x28, %x[qp], %[offsetof_Requantize32_minval]\n" + "movi v24.4s, #0x0\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "add x27, %x[qp], %[offsetof_Requantize32_maxval]\n" + "mov v18.16b, v13.16b\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "cmp %x[n_channels], #0x4\n" + "ext v18.16b, v18.16b, v18.16b, #0x1\n" + "ldr x19, [%x[inptrs], #0x28]\n" + "mov v17.16b, v12.16b\n" + "ld1 { v6.16b }, [x21]\n" + "ext v17.16b, v17.16b, v17.16b, #0x1\n" + "ld1 { v5.16b }, [x20]\n" + "mov v16.16b, v7.16b\n" + "ld1 { v4.16b }, [x19]\n" + "ext v16.16b, v16.16b, v16.16b, #0x1\n" + "ldr x20, [%x[inptrs], #0x30]\n" + "zip1 v13.2d, v13.2d, v18.2d\n" + "ldr x19, [%x[inptrs], #0x38]\n" + "zip1 v12.2d, v12.2d, v17.2d\n" + "ld1r { v3.4s }, [x22]\n" + "mov v18.16b, v6.16b\n" + "ld1 { v2.16b }, [x20]\n" + "zip1 v7.2d, v7.2d, v16.2d\n" + "ld1 { v1.16b }, [x19]\n" + "ext v18.16b, v18.16b, v18.16b, #0x1\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "mov v17.16b, v5.16b\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "ext v17.16b, v17.16b, v17.16b, #0x1\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "mov v16.16b, v4.16b\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "zip1 v6.2d, v6.2d, v18.2d\n" + "ld1r { v0.4s }, [x9]\n" + "ext v16.16b, v16.16b, v16.16b, #0x1\n" + "ld1r { v31.4s }, [x28]\n" + "zip1 v5.2d, v5.2d, v17.2d\n" + "ld1r { v30.4s }, [x27]\n" + "mov v17.16b, v2.16b\n" + "ldr q29, [%x[params], #0x0]\n" + "ext v17.16b, v17.16b, v17.16b, #0x1\n" + "ldr q8, [%x[params], #0x10]\n" + "zip1 v4.2d, v4.2d, v16.2d\n" + "ldr q9, [%x[params], #0x20]\n" + "mov v16.16b, v1.16b\n" + "ldr q10, [%x[params], #0x30]\n" + "ext v16.16b, v16.16b, v16.16b, #0x1\n" + "ldr q11, [%x[params], #0x40]\n" + "add %x[params], %x[params], #0x50\n" + "zip1 v2.2d, v2.2d, v17.2d\n" + "movi v23.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "zip1 v1.2d, v1.2d, v16.2d\n" + "movi v21.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v19.4s, #0x0\n" + ".inst 0x6f8de1fc // udot v28.4s, v15.16b, v13.4b[0]\n" + ".inst 0x6f8de9fb // udot v27.4s, v15.16b, v13.4b[2]\n" + ".inst 0x6f8ce1fa // udot v26.4s, v15.16b, v12.4b[0]\n" + ".inst 0x6f8ce9f9 // udot v25.4s, v15.16b, v12.4b[2]\n" + ".inst 0x6fade1dc // udot v28.4s, v14.16b, v13.4b[1]\n" + ".inst 0x6fade9db // udot v27.4s, v14.16b, v13.4b[3]\n" + ".inst 0x6face1da // udot v26.4s, v14.16b, v12.4b[1]\n" + ".inst 0x6face9d9 // udot v25.4s, v14.16b, v12.4b[3]\n" + ".inst 0x6f87e1f8 // udot v24.4s, v15.16b, v7.4b[0]\n" + ".inst 0x6f87e9f7 // udot v23.4s, v15.16b, v7.4b[2]\n" + ".inst 0x6f86e1f6 // udot v22.4s, v15.16b, v6.4b[0]\n" + ".inst 0x6f86e9f5 // udot v21.4s, v15.16b, v6.4b[2]\n" + 
".inst 0x6fa7e1d8 // udot v24.4s, v14.16b, v7.4b[1]\n" + ".inst 0x6fa7e9d7 // udot v23.4s, v14.16b, v7.4b[3]\n" + ".inst 0x6fa6e1d6 // udot v22.4s, v14.16b, v6.4b[1]\n" + ".inst 0x6fa6e9d5 // udot v21.4s, v14.16b, v6.4b[3]\n" + ".inst 0x6f85e1f2 // udot v18.4s, v15.16b, v5.4b[0]\n" + ".inst 0x6f85e9f1 // udot v17.4s, v15.16b, v5.4b[2]\n" + ".inst 0x6f84e1f0 // udot v16.4s, v15.16b, v4.4b[0]\n" + ".inst 0x6f84e9f4 // udot v20.4s, v15.16b, v4.4b[2]\n" + ".inst 0x6fa5e1d2 // udot v18.4s, v14.16b, v5.4b[1]\n" + ".inst 0x6fa5e9d1 // udot v17.4s, v14.16b, v5.4b[3]\n" + ".inst 0x6fa4e1d0 // udot v16.4s, v14.16b, v4.4b[1]\n" + ".inst 0x6fa4e9d4 // udot v20.4s, v14.16b, v4.4b[3]\n" + ".inst 0x6f82e1f3 // udot v19.4s, v15.16b, v2.4b[0]\n" + "mov v28.16b, v28.16b\n" + "mov v27.16b, v27.16b\n" + "add v28.4s, v28.4s, v26.4s\n" + ".inst 0x6fa2e1d3 // udot v19.4s, v14.16b, v2.4b[1]\n" + "add v27.4s, v27.4s, v25.4s\n" + "add v28.4s, v28.4s, v24.4s\n" + "mov v26.16b, v26.16b\n" + "add v27.4s, v27.4s, v23.4s\n" + "add v28.4s, v28.4s, v22.4s\n" + "mov v25.16b, v25.16b\n" + "add v27.4s, v27.4s, v21.4s\n" + "add v28.4s, v28.4s, v18.4s\n" + "add v26.4s, v26.4s, v24.4s\n" + "add v27.4s, v27.4s, v17.4s\n" + "add v25.4s, v25.4s, v23.4s\n" + "add v26.4s, v26.4s, v22.4s\n" + "mov v24.16b, v24.16b\n" + "add v25.4s, v25.4s, v21.4s\n" + "add v26.4s, v26.4s, v18.4s\n" + "mov v23.16b, v23.16b\n" + "add v25.4s, v25.4s, v17.4s\n" + "add v26.4s, v26.4s, v16.4s\n" + "add v24.4s, v24.4s, v22.4s\n" + "add v25.4s, v25.4s, v20.4s\n" + "add v23.4s, v23.4s, v21.4s\n" + "add v24.4s, v24.4s, v18.4s\n" + "mov v22.16b, v22.16b\n" + "add v23.4s, v23.4s, v17.4s\n" + "add v24.4s, v24.4s, v16.4s\n" + "mov v21.16b, v21.16b\n" + "add v23.4s, v23.4s, v20.4s\n" + "add v24.4s, v24.4s, v19.4s\n" + "add v22.4s, v22.4s, v18.4s\n" + "movi v18.4s, #0x0\n" + ".inst 0x6f82e9f2 // udot v18.4s, v15.16b, v2.4b[2]\n" + "add v21.4s, v21.4s, v17.4s\n" + "movi v17.4s, #0x0\n" + ".inst 0x6f81e1f1 // udot v17.4s, v15.16b, v1.4b[0]\n" + ".inst 0x6fa2e9d2 // udot v18.4s, v14.16b, v2.4b[3]\n" + "add v22.4s, v22.4s, v16.4s\n" + "movi v16.4s, #0x0\n" + ".inst 0x6fa1e1d1 // udot v17.4s, v14.16b, v1.4b[1]\n" + ".inst 0x6f81e9f0 // udot v16.4s, v15.16b, v1.4b[2]\n" + "add v23.4s, v23.4s, v18.4s\n" + "add v21.4s, v21.4s, v20.4s\n" + "add v22.4s, v22.4s, v19.4s\n" + ".inst 0x6fa1e9d0 // udot v16.4s, v14.16b, v1.4b[3]\n" + "add v21.4s, v21.4s, v18.4s\n" + "add v22.4s, v22.4s, v17.4s\n" + "neg v3.4s, v3.4s\n" + "add v21.4s, v21.4s, v16.4s\n" + "mul v28.4s, v28.4s, v3.4s\n" + "str q28, [SP, #0x0]\n" + "mul v27.4s, v27.4s, v3.4s\n" + "mul v26.4s, v26.4s, v3.4s\n" + "str q27, [SP, #0x10]\n" + "mul v25.4s, v25.4s, v3.4s\n" + "mul v24.4s, v24.4s, v3.4s\n" + "str q26, [SP, #0x20]\n" + "mul v23.4s, v23.4s, v3.4s\n" + "str q25, [SP, #0x30]\n" + "mul v22.4s, v22.4s, v3.4s\n" + "mul v21.4s, v21.4s, v3.4s\n" + "str q24, [SP, #0x40]\n" + "add v28.4s, v28.4s, v29.4s\n" + "str q23, [SP, #0x50]\n" + "add v27.4s, v27.4s, v29.4s\n" + "str q22, [SP, #0x60]\n" + "add v26.4s, v26.4s, v29.4s\n" + "add v25.4s, v25.4s, v29.4s\n" + "str q21, [SP, #0x70]\n" + "add v24.4s, v24.4s, v29.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "add v21.4s, v21.4s, v29.4s\n" + "ble 2f\n" + "1:" // Loop + ".inst 0x6f8de11c // udot v28.4s, v8.16b, v13.4b[0]\n" + "ldr q20, [%x[params], #0x60]\n" + "add x11, x11, #0x10\n" + ".inst 0x6f8de91b // udot v27.4s, v8.16b, v13.4b[2]\n" + "ldr q19, [%x[params], #0x70]\n" + "sub %x[n_channels], %x[n_channels], #0x4\n" + ".inst 0x6f8ce11a // udot 
v26.4s, v8.16b, v12.4b[0]\n" + "ldr q29, [%x[params], #0x80]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x6f8ce919 // udot v25.4s, v8.16b, v12.4b[2]\n" + ".inst 0x6f87e118 // udot v24.4s, v8.16b, v7.4b[0]\n" + ".inst 0x6f87e917 // udot v23.4s, v8.16b, v7.4b[2]\n" + ".inst 0x6f86e116 // udot v22.4s, v8.16b, v6.4b[0]\n" + ".inst 0x6f86e915 // udot v21.4s, v8.16b, v6.4b[2]\n" + "ldr q8, [%x[params], #0x0]\n" + ".inst 0x6fade13c // udot v28.4s, v9.16b, v13.4b[1]\n" + ".inst 0x6fade93b // udot v27.4s, v9.16b, v13.4b[3]\n" + ".inst 0x6face13a // udot v26.4s, v9.16b, v12.4b[1]\n" + ".inst 0x6face939 // udot v25.4s, v9.16b, v12.4b[3]\n" + ".inst 0x6fa7e138 // udot v24.4s, v9.16b, v7.4b[1]\n" + ".inst 0x6fa7e937 // udot v23.4s, v9.16b, v7.4b[3]\n" + ".inst 0x6fa6e136 // udot v22.4s, v9.16b, v6.4b[1]\n" + ".inst 0x6fa6e935 // udot v21.4s, v9.16b, v6.4b[3]\n" + "ldr q9, [%x[params], #0x10]\n" + ".inst 0x6f8ce15c // udot v28.4s, v10.16b, v12.4b[0]\n" + ".inst 0x6f8ce95b // udot v27.4s, v10.16b, v12.4b[2]\n" + ".inst 0x6f87e15a // udot v26.4s, v10.16b, v7.4b[0]\n" + ".inst 0x6f87e959 // udot v25.4s, v10.16b, v7.4b[2]\n" + ".inst 0x6f86e158 // udot v24.4s, v10.16b, v6.4b[0]\n" + ".inst 0x6f86e957 // udot v23.4s, v10.16b, v6.4b[2]\n" + ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + ".inst 0x6f85e955 // udot v21.4s, v10.16b, v5.4b[2]\n" + "ldr q10, [%x[params], #0x20]\n" + ".inst 0x6face17c // udot v28.4s, v11.16b, v12.4b[1]\n" + ".inst 0x6face97b // udot v27.4s, v11.16b, v12.4b[3]\n" + ".inst 0x6fa7e17a // udot v26.4s, v11.16b, v7.4b[1]\n" + ".inst 0x6fa7e979 // udot v25.4s, v11.16b, v7.4b[3]\n" + ".inst 0x6fa6e178 // udot v24.4s, v11.16b, v6.4b[1]\n" + ".inst 0x6fa6e977 // udot v23.4s, v11.16b, v6.4b[3]\n" + ".inst 0x6fa5e176 // udot v22.4s, v11.16b, v5.4b[1]\n" + ".inst 0x6fa5e975 // udot v21.4s, v11.16b, v5.4b[3]\n" + "ldr q11, [%x[params], #0x30]\n" + ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" + ".inst 0x6f87e91b // udot v27.4s, v8.16b, v7.4b[2]\n" + ".inst 0x6f86e11a // udot v26.4s, v8.16b, v6.4b[0]\n" + ".inst 0x6f86e919 // udot v25.4s, v8.16b, v6.4b[2]\n" + ".inst 0x6f85e118 // udot v24.4s, v8.16b, v5.4b[0]\n" + ".inst 0x6f85e917 // udot v23.4s, v8.16b, v5.4b[2]\n" + ".inst 0x6f84e116 // udot v22.4s, v8.16b, v4.4b[0]\n" + ".inst 0x6f84e915 // udot v21.4s, v8.16b, v4.4b[2]\n" + "ldr q8, [%x[params], #0x40]\n" + ".inst 0x6fa7e13c // udot v28.4s, v9.16b, v7.4b[1]\n" + ".inst 0x6fa7e93b // udot v27.4s, v9.16b, v7.4b[3]\n" + ".inst 0x6fa6e13a // udot v26.4s, v9.16b, v6.4b[1]\n" + ".inst 0x6fa6e939 // udot v25.4s, v9.16b, v6.4b[3]\n" + ".inst 0x6fa5e138 // udot v24.4s, v9.16b, v5.4b[1]\n" + ".inst 0x6fa5e937 // udot v23.4s, v9.16b, v5.4b[3]\n" + ".inst 0x6fa4e136 // udot v22.4s, v9.16b, v4.4b[1]\n" + ".inst 0x6fa4e935 // udot v21.4s, v9.16b, v4.4b[3]\n" + "ldr q9, [%x[params], #0x50]\n" + ".inst 0x6f86e15c // udot v28.4s, v10.16b, v6.4b[0]\n" + ".inst 0x6f86e95b // udot v27.4s, v10.16b, v6.4b[2]\n" + ".inst 0x6f85e15a // udot v26.4s, v10.16b, v5.4b[0]\n" + ".inst 0x6f85e959 // udot v25.4s, v10.16b, v5.4b[2]\n" + ".inst 0x6f84e158 // udot v24.4s, v10.16b, v4.4b[0]\n" + ".inst 0x6f84e957 // udot v23.4s, v10.16b, v4.4b[2]\n" + ".inst 0x6f82e156 // udot v22.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f82e955 // udot v21.4s, v10.16b, v2.4b[2]\n" + "ldr q10, [%x[params], #0xb0]\n" + ".inst 0x6fa6e17c // udot v28.4s, v11.16b, v6.4b[1]\n" + ".inst 0x6fa6e97b // udot v27.4s, v11.16b, v6.4b[3]\n" + ".inst 0x6fa5e17a // udot v26.4s, v11.16b, v5.4b[1]\n" + ".inst 0x6fa5e979 // udot v25.4s, v11.16b, 
v5.4b[3]\n" + ".inst 0x6fa4e178 // udot v24.4s, v11.16b, v4.4b[1]\n" + ".inst 0x6fa4e977 // udot v23.4s, v11.16b, v4.4b[3]\n" + ".inst 0x6fa2e176 // udot v22.4s, v11.16b, v2.4b[1]\n" + ".inst 0x6fa2e975 // udot v21.4s, v11.16b, v2.4b[3]\n" + "ldr q11, [%x[params], #0xc0]\n" + ".inst 0x6f85e11c // udot v28.4s, v8.16b, v5.4b[0]\n" + ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n" + ".inst 0x6f84e11a // udot v26.4s, v8.16b, v4.4b[0]\n" + ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n" + ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f82e917 // udot v23.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + "ldr q8, [%x[params], #0x90]\n" + ".inst 0x6fa5e13c // udot v28.4s, v9.16b, v5.4b[1]\n" + ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n" + ".inst 0x6fa4e13a // udot v26.4s, v9.16b, v4.4b[1]\n" + ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n" + ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa2e937 // udot v23.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa1e136 // udot v22.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa1e935 // udot v21.4s, v9.16b, v1.4b[3]\n" + "ldr q9, [%x[params], #0xa0]\n" + "add %x[params], %x[params], #0xd0\n" + "sqrdmulh v28.4s, v28.4s, v20.4s\n" + "sqrdmulh v27.4s, v27.4s, v20.4s\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "sqrdmulh v24.4s, v24.4s, v20.4s\n" + "and v18.16b, v28.16b, v19.16b\n" + "and v17.16b, v27.16b, v19.16b\n" + "and v16.16b, v26.16b, v19.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v18.4s\n" + "sqadd v27.4s, v27.4s, v17.4s\n" + "sqadd v26.4s, v26.4s, v16.4s\n" + "and v16.16b, v25.16b, v19.16b\n" + "srshl v28.4s, v28.4s, v19.4s\n" + "srshl v27.4s, v27.4s, v19.4s\n" + "srshl v26.4s, v26.4s, v19.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v26.4s, v26.4s, v0.4s\n" + "smin v28.4s, v28.4s, v30.4s\n" + "smin v27.4s, v27.4s, v30.4s\n" + "smin v26.4s, v26.4s, v30.4s\n" + "smax v28.4s, v28.4s, v31.4s\n" + "smax v27.4s, v27.4s, v31.4s\n" + "smax v26.4s, v26.4s, v31.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x26, x10]\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "ldr q28, [SP, #0x0]\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "str s27, [x25, x10]\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "ldr q27, [SP, #0x10]\n" + "and v16.16b, v24.16b, v19.16b\n" + "str s26, [x24, x10]\n" + "sqrdmulh v23.4s, v23.4s, v20.4s\n" + "ldr q26, [SP, #0x20]\n" + "srshl v25.4s, v25.4s, v19.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "and v17.16b, v23.16b, v19.16b\n" + "add v25.4s, v25.4s, v0.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v30.4s\n" + "and v16.16b, v22.16b, v19.16b\n" + "srshl v24.4s, v24.4s, v19.4s\n" + "smax v25.4s, v25.4s, v31.4s\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v0.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x23, x10]\n" + "smin v24.4s, v24.4s, v30.4s\n" + "srshl v23.4s, v23.4s, v19.4s\n" + "ldr q25, [SP, #0x30]\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v24.4s, v24.4s, v31.4s\n" + "add v23.4s, v23.4s, 
v0.4s\n" + "srshl v22.4s, v22.4s, v19.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "smin v23.4s, v23.4s, v30.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x22, x10]\n" + "smax v23.4s, v23.4s, v31.4s\n" + "add v22.4s, v22.4s, v0.4s\n" + "ldr q24, [SP, #0x40]\n" + "and v16.16b, v21.16b, v19.16b\n" + "add v28.4s, v28.4s, v29.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v22.4s, v22.4s, v30.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x21, x10]\n" + "smax v22.4s, v22.4s, v31.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr q23, [SP, #0x50]\n" + "add v27.4s, v27.4s, v29.4s\n" + "add v26.4s, v26.4s, v29.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x20, x10]\n" + "add v25.4s, v25.4s, v29.4s\n" + "add v24.4s, v24.4s, v29.4s\n" + "ldr q22, [SP, #0x60]\n" + "srshl v21.4s, v21.4s, v19.4s\n" + "add v23.4s, v23.4s, v29.4s\n" + "add v21.4s, v21.4s, v0.4s\n" + "add v22.4s, v22.4s, v29.4s\n" + "smin v21.4s, v21.4s, v30.4s\n" + "smax v21.4s, v21.4s, v31.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x19, x10]\n" + "add x10, x10, #0x4\n" + "ldr q21, [SP, #0x70]\n" + "add v21.4s, v21.4s, v29.4s\n" + "bgt 1b\n" + "2:" // Tail + ".inst 0x6f8de11c // udot v28.4s, v8.16b, v13.4b[0]\n" + "ldr q20, [%x[params], #0x60]\n" + "add x26, x26, x10\n" + ".inst 0x6f8de91b // udot v27.4s, v8.16b, v13.4b[2]\n" + "ldr q19, [%x[params], #0x70]\n" + "add x25, x25, x10\n" + ".inst 0x6f8ce11a // udot v26.4s, v8.16b, v12.4b[0]\n" + "add x24, x24, x10\n" + ".inst 0x6f8ce919 // udot v25.4s, v8.16b, v12.4b[2]\n" + "add x23, x23, x10\n" + ".inst 0x6f87e118 // udot v24.4s, v8.16b, v7.4b[0]\n" + "add x22, x22, x10\n" + ".inst 0x6f87e917 // udot v23.4s, v8.16b, v7.4b[2]\n" + "add x21, x21, x10\n" + ".inst 0x6f86e116 // udot v22.4s, v8.16b, v6.4b[0]\n" + "add x20, x20, x10\n" + ".inst 0x6f86e915 // udot v21.4s, v8.16b, v6.4b[2]\n" + "ldr q8, [%x[params], #0x0]\n" + "add x19, x19, x10\n" + ".inst 0x6fade13c // udot v28.4s, v9.16b, v13.4b[1]\n" + "cmp %x[n_channels], #0x4\n" + ".inst 0x6fade93b // udot v27.4s, v9.16b, v13.4b[3]\n" + ".inst 0x6face13a // udot v26.4s, v9.16b, v12.4b[1]\n" + ".inst 0x6face939 // udot v25.4s, v9.16b, v12.4b[3]\n" + ".inst 0x6fa7e138 // udot v24.4s, v9.16b, v7.4b[1]\n" + ".inst 0x6fa7e937 // udot v23.4s, v9.16b, v7.4b[3]\n" + ".inst 0x6fa6e136 // udot v22.4s, v9.16b, v6.4b[1]\n" + ".inst 0x6fa6e935 // udot v21.4s, v9.16b, v6.4b[3]\n" + "ldr q9, [%x[params], #0x10]\n" + ".inst 0x6f8ce15c // udot v28.4s, v10.16b, v12.4b[0]\n" + ".inst 0x6f8ce95b // udot v27.4s, v10.16b, v12.4b[2]\n" + ".inst 0x6f87e15a // udot v26.4s, v10.16b, v7.4b[0]\n" + ".inst 0x6f87e959 // udot v25.4s, v10.16b, v7.4b[2]\n" + ".inst 0x6f86e158 // udot v24.4s, v10.16b, v6.4b[0]\n" + ".inst 0x6f86e957 // udot v23.4s, v10.16b, v6.4b[2]\n" + ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" + ".inst 0x6f85e955 // udot v21.4s, v10.16b, v5.4b[2]\n" + "ldr q10, [%x[params], #0x20]\n" + ".inst 0x6face17c // udot v28.4s, v11.16b, v12.4b[1]\n" + ".inst 0x6face97b // udot v27.4s, v11.16b, v12.4b[3]\n" + ".inst 0x6fa7e17a // udot v26.4s, v11.16b, v7.4b[1]\n" + ".inst 0x6fa7e979 // udot v25.4s, v11.16b, v7.4b[3]\n" + ".inst 0x6fa6e178 // udot v24.4s, v11.16b, v6.4b[1]\n" + ".inst 0x6fa6e977 // udot v23.4s, v11.16b, v6.4b[3]\n" + ".inst 0x6fa5e176 // udot v22.4s, v11.16b, v5.4b[1]\n" + ".inst 0x6fa5e975 // udot v21.4s, v11.16b, v5.4b[3]\n" + "ldr q11, [%x[params], #0x30]\n" + ".inst 0x6f87e11c // udot 
v28.4s, v8.16b, v7.4b[0]\n" + ".inst 0x6f87e91b // udot v27.4s, v8.16b, v7.4b[2]\n" + ".inst 0x6f86e11a // udot v26.4s, v8.16b, v6.4b[0]\n" + ".inst 0x6f86e919 // udot v25.4s, v8.16b, v6.4b[2]\n" + ".inst 0x6f85e118 // udot v24.4s, v8.16b, v5.4b[0]\n" + ".inst 0x6f85e917 // udot v23.4s, v8.16b, v5.4b[2]\n" + ".inst 0x6f84e116 // udot v22.4s, v8.16b, v4.4b[0]\n" + ".inst 0x6f84e915 // udot v21.4s, v8.16b, v4.4b[2]\n" + "ldr q8, [%x[params], #0x40]\n" + ".inst 0x6fa7e13c // udot v28.4s, v9.16b, v7.4b[1]\n" + ".inst 0x6fa7e93b // udot v27.4s, v9.16b, v7.4b[3]\n" + ".inst 0x6fa6e13a // udot v26.4s, v9.16b, v6.4b[1]\n" + ".inst 0x6fa6e939 // udot v25.4s, v9.16b, v6.4b[3]\n" + ".inst 0x6fa5e138 // udot v24.4s, v9.16b, v5.4b[1]\n" + ".inst 0x6fa5e937 // udot v23.4s, v9.16b, v5.4b[3]\n" + ".inst 0x6fa4e136 // udot v22.4s, v9.16b, v4.4b[1]\n" + ".inst 0x6fa4e935 // udot v21.4s, v9.16b, v4.4b[3]\n" + "ldr q9, [%x[params], #0x50]\n" + "add %x[params], %x[params], #0x80\n" + ".inst 0x6f86e15c // udot v28.4s, v10.16b, v6.4b[0]\n" + ".inst 0x6f86e95b // udot v27.4s, v10.16b, v6.4b[2]\n" + ".inst 0x6f85e15a // udot v26.4s, v10.16b, v5.4b[0]\n" + ".inst 0x6f85e959 // udot v25.4s, v10.16b, v5.4b[2]\n" + ".inst 0x6f84e158 // udot v24.4s, v10.16b, v4.4b[0]\n" + ".inst 0x6f84e957 // udot v23.4s, v10.16b, v4.4b[2]\n" + ".inst 0x6f82e156 // udot v22.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f82e955 // udot v21.4s, v10.16b, v2.4b[2]\n" + ".inst 0x6fa6e17c // udot v28.4s, v11.16b, v6.4b[1]\n" + ".inst 0x6fa6e97b // udot v27.4s, v11.16b, v6.4b[3]\n" + ".inst 0x6fa5e17a // udot v26.4s, v11.16b, v5.4b[1]\n" + ".inst 0x6fa5e979 // udot v25.4s, v11.16b, v5.4b[3]\n" + ".inst 0x6fa4e178 // udot v24.4s, v11.16b, v4.4b[1]\n" + ".inst 0x6fa4e977 // udot v23.4s, v11.16b, v4.4b[3]\n" + ".inst 0x6fa2e176 // udot v22.4s, v11.16b, v2.4b[1]\n" + ".inst 0x6fa2e975 // udot v21.4s, v11.16b, v2.4b[3]\n" + ".inst 0x6f85e11c // udot v28.4s, v8.16b, v5.4b[0]\n" + ".inst 0x6f85e91b // udot v27.4s, v8.16b, v5.4b[2]\n" + ".inst 0x6f84e11a // udot v26.4s, v8.16b, v4.4b[0]\n" + ".inst 0x6f84e919 // udot v25.4s, v8.16b, v4.4b[2]\n" + ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f82e917 // udot v23.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa5e13c // udot v28.4s, v9.16b, v5.4b[1]\n" + ".inst 0x6fa5e93b // udot v27.4s, v9.16b, v5.4b[3]\n" + ".inst 0x6fa4e13a // udot v26.4s, v9.16b, v4.4b[1]\n" + ".inst 0x6fa4e939 // udot v25.4s, v9.16b, v4.4b[3]\n" + ".inst 0x6fa2e138 // udot v24.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa2e937 // udot v23.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa1e136 // udot v22.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa1e935 // udot v21.4s, v9.16b, v1.4b[3]\n" + "sqrdmulh v28.4s, v28.4s, v20.4s\n" + "sqrdmulh v27.4s, v27.4s, v20.4s\n" + "sqrdmulh v26.4s, v26.4s, v20.4s\n" + "sqrdmulh v25.4s, v25.4s, v20.4s\n" + "and v18.16b, v28.16b, v19.16b\n" + "and v17.16b, v27.16b, v19.16b\n" + "and v16.16b, v26.16b, v19.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v18.4s\n" + "sqadd v27.4s, v27.4s, v17.4s\n" + "sqadd v26.4s, v26.4s, v16.4s\n" + "and v16.16b, v25.16b, v19.16b\n" + "srshl v28.4s, v28.4s, v19.4s\n" + "srshl v27.4s, v27.4s, v19.4s\n" + "srshl v26.4s, v26.4s, v19.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v27.4s, v27.4s, v0.4s\n" + "add v26.4s, v26.4s, v0.4s\n" + "smin v28.4s, v28.4s, v30.4s\n" + 
"smin v27.4s, v27.4s, v30.4s\n" + "smin v26.4s, v26.4s, v30.4s\n" + "smax v28.4s, v28.4s, v31.4s\n" + "smax v27.4s, v27.4s, v31.4s\n" + "smax v26.4s, v26.4s, v31.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "sqrdmulh v24.4s, v24.4s, v20.4s\n" + "sqrdmulh v23.4s, v23.4s, v20.4s\n" + "srshl v25.4s, v25.4s, v19.4s\n" + "sqrdmulh v22.4s, v22.4s, v20.4s\n" + "and v16.16b, v24.16b, v19.16b\n" + "and v17.16b, v23.16b, v19.16b\n" + "add v25.4s, v25.4s, v0.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smin v25.4s, v25.4s, v30.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "smax v25.4s, v25.4s, v31.4s\n" + "and v16.16b, v22.16b, v19.16b\n" + "srshl v24.4s, v24.4s, v19.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "srshl v23.4s, v23.4s, v19.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v0.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v23.4s, v23.4s, v0.4s\n" + "smin v24.4s, v24.4s, v30.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "smin v23.4s, v23.4s, v30.4s\n" + "smax v24.4s, v24.4s, v31.4s\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "smax v23.4s, v23.4s, v31.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "srshl v22.4s, v22.4s, v19.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "and v16.16b, v21.16b, v19.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "add v22.4s, v22.4s, v0.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v22.4s, v22.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "smax v22.4s, v22.4s, v31.4s\n" + "srshl v21.4s, v21.4s, v19.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "add v21.4s, v21.4s, v0.4s\n" + "smin v21.4s, v21.4s, v30.4s\n" + "smax v21.4s, v21.4s, v31.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "blt 3f\n" + "str s28, [x26, #0x0]\n" + "str s27, [x25, #0x0]\n" + "str s26, [x24, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s24, [x22, #0x0]\n" + "str s23, [x21, #0x0]\n" + "str s22, [x20, #0x0]\n" + "str s21, [x19, #0x0]\n" + "b 4f\n" + "3:" // Tail: Oddments + "st1 { v28.b }[0], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[0], [x25], #0x1\n" + "st1 { v26.b }[0], [x24], #0x1\n" + "st1 { v25.b }[0], [x23], #0x1\n" + "st1 { v24.b }[0], [x22], #0x1\n" + "st1 { v23.b }[0], [x21], #0x1\n" + "st1 { v22.b }[0], [x20], #0x1\n" + "st1 { v21.b }[0], [x19], #0x1\n" + "beq 4f\n" + "st1 { v28.b }[1], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[1], [x25], #0x1\n" + "st1 { v26.b }[1], [x24], #0x1\n" + "st1 { v25.b }[1], [x23], #0x1\n" + "st1 { v24.b }[1], [x22], #0x1\n" + "st1 { v23.b }[1], [x21], #0x1\n" + "st1 { v22.b }[1], [x20], #0x1\n" + "st1 { v21.b }[1], [x19], #0x1\n" + "beq 4f\n" + "st1 { v28.b }[2], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[2], [x25], #0x1\n" + "st1 { v26.b }[2], [x24], #0x1\n" + "st1 { v25.b }[2], [x23], #0x1\n" + "st1 { v24.b }[2], [x22], #0x1\n" + "st1 { v23.b }[2], [x21], #0x1\n" + "st1 { v22.b }[2], [x20], #0x1\n" + "st1 { v21.b }[2], [x19], #0x1\n" + "beq 4f\n" + "st1 { v28.b }[3], [x26], #0x1\n" + "subs %x[n_channels], %x[n_channels], #0x1\n" + "st1 { v27.b }[3], [x25], #0x1\n" + "st1 { v26.b }[3], [x24], #0x1\n" + "st1 { v25.b }[3], [x23], #0x1\n" + "st1 { 
v24.b }[3], [x22], #0x1\n" + "st1 { v23.b }[3], [x21], #0x1\n" + "st1 { v22.b }[3], [x20], #0x1\n" + "st1 { v21.b }[3], [x19], #0x1\n" + "4:" // Tail: End + "add SP, SP, #0x80\n" + : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params) + : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp new file mode 100644 index 0000000000..6b52017ce1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
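Note (annotation, not part of the patch): the requantization runs in the assembly above (sqrdmulh, and/sshr/sqadd, srshl, add, smin/smax, uzp1) all implement the same fixed-point rescale from 32-bit accumulators back to uint8. A minimal C++ sketch of that arithmetic, assuming gemmlowp-style rounding and ignoring the saturation corner case of sqrdmulh; the names mul, shift, c_offset, minval and maxval are illustrative stand-ins for the Requantize32 fields referenced via offsetof above:

#include <algorithm>
#include <cstdint>

static inline uint8_t requantize(int32_t acc, int32_t mul, int shift,
                                 int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: high 32 bits of a saturating rounding doubling multiply.
    int32_t hi = static_cast<int32_t>((static_cast<int64_t>(acc) * mul + (1LL << 30)) >> 31);
    // and/sshr/sqadd followed by srshl: rounding divide by 2^shift,
    // with ties rounded away from zero.
    const int32_t mask      = (1 << shift) - 1;
    const int32_t remainder = hi & mask;
    const int32_t threshold = (mask >> 1) + (hi < 0 ? 1 : 0);
    int32_t out = (hi >> shift) + (remainder > threshold ? 1 : 0);
    // add the output offset, then smin/smax clamp; uzp1 narrows to bytes.
    out += c_offset;
    return static_cast<uint8_t>(std::min(std::max(out, minval), maxval));
}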
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); + +struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int output_rows(void) { return 2; }; + constexpr static unsigned int output_cols(void) { return 8; }; + + constexpr static unsigned int output_col_regs(void) { return 2; }; + + kern_type kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + + a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..ada1818eba --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp @@ -0,0 +1,1484 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "arm_gemm.hpp" +#include +#include + +namespace arm_conv { +namespace depthwise { + +void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const uint8_t *weights, + const int32_t *bias, + const unsigned int kernel_points, + const unsigned int n_output_channels, + const int32_t *per_channel_left_shifts, + const int32_t *per_channel_muls, + const int32_t *per_channel_right_shifts, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "mov x9, #0x0\n" + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v14.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n" + "ld1r { v13.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n" + "ld1r { v12.16b }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "ld1r { v11.16b }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v10.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n" + "ld1r { v9.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n" + "ld1r { v8.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n" + "ld1r { v7.4s }, [x19]\n" + "lsr x28, %x[n_output_channels], #0x2\n" + "cbz x28, 9f\n" + "1:" // Output channel loop + "movi v16.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x9, #0x2\n" + "ldr q16, [%x[bias], x19]\n" + "2:" // Output channel loop: Load bias: Done + "mov v6.16b, v16.16b\n" + "mov v5.16b, v16.16b\n" + "mov v4.16b, v16.16b\n" + "mov v31.16b, v16.16b\n" + "mov v30.16b, v16.16b\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "cbz %x[rq_mul_ptr], 3f\n" + "lsl x19, x9, #0x2\n" + "ldr q8, [%x[rq_mul_ptr], x19]\n" + "ldr q7, [%x[rq_right_shift_ptr], x19]\n" + "cbz %x[rq_left_shift_ptr], 3f\n" + "ldr q9, [%x[rq_left_shift_ptr], x19]\n" + "3:" // Output channel loop: Load quantization parameters: Done + "ldr s17, [%x[weights]], #0x4\n" + "usubl v17.8h, v17.8b, v11.8b\n" + "mov x19, %x[inptrs]\n" + "ldp x25, x27, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "ldr d3, [x25, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr d2, [x27, #0x0]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "cbz x20, 7f\n" + "ldp x25, x27, [x19], #0x10\n" + "ldr s16, [%x[weights]], #0x4\n" + "usubl v16.8h, v16.8b, v11.8b\n" + "ldr d1, [x25, #0x0]\n" + "subs x20, x20, #0x1\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr d0, [x27, #0x0]\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "beq 5f\n" + "4:" // Output channel loop: Kernel loop + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl v3.8h, 
v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "usubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "ldr d1, [x25, #0x0]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "ldr d0, [x27, #0x0]\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr s16, [%x[weights]], #0x4\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "usubl v16.8h, v16.8b, v11.8b\n" + "bgt 4b\n" + "5:" // Output channel loop: Kernel loop tail + "tbnz %x[kernel_points], #0, 6f\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, 
v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, 
v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "b 8f\n" + "6:" // Output channel loop: Odd tail + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "usubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, 
v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, 
v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin 
v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "b 8f\n" + "7:" // Output channel loop: Single kernel point + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, 
v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" 
+ "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "8:" // Output channel loop: Done + "add x9, x9, #0x4\n" + "cmp x9, x28, LSL #2\n" + "blt 1b\n" + "tst %x[n_output_channels], #0x3\n" + "beq 26f\n" + "9:" // Output channel oddments + "movi v16.4s, #0x0\n" + "cbz %x[bias], 12f\n" + "add x19, %x[bias], x9, LSL #2\n" + "tbz %x[n_output_channels], #1, 10f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #0, 11f\n" + "ld1 { v16.s }[2], [x19]\n" + "b 11f\n" + "10:" // Output channel oddments: Load bias: Bit 1: Unset + "tbz %x[n_output_channels], #0, 11f\n" + "ld1 { v16.s }[0], [x19]\n" + "11:" // Output channel oddments: Load bias: Bit 1: End + + "12:" // Output channel oddments: Load bias: Done + "mov v6.16b, v16.16b\n" + "mov v5.16b, v16.16b\n" + "mov v4.16b, v16.16b\n" + "mov v31.16b, v16.16b\n" + "mov v30.16b, v16.16b\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "cbz %x[rq_mul_ptr], 18f\n" + "add x21, %x[rq_mul_ptr], x9, LSL #2\n" + "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n" + "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n" + "cbz %x[rq_left_shift_ptr], 15f\n" + "tbz %x[n_output_channels], #1, 13f\n" + "ld1 { v8.d }[0], [x21], #0x8\n" + "ld1 { v7.d }[0], [x20], #0x8\n" + "ld1 { v9.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #0, 14f\n" + "ld1 { v8.s }[2], [x21], #0x4\n" + "ld1 { v7.s }[2], [x20], #0x4\n" + "ld1 { v9.s }[2], [x19], #0x4\n" + "b 14f\n" + "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset + "tbz %x[n_output_channels], #0, 14f\n" + "ld1 { v8.s }[0], [x21], #0x4\n" + "ld1 { v7.s }[0], [x20], #0x4\n" + "ld1 { v9.s }[0], [x19], #0x4\n" + "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End + "b 18f\n" + "15:" // Output channel oddments: Load quantization parameters: No left shift + "tbz %x[n_output_channels], #1, 16f\n" + "ld1 { v8.d }[0], [x21], #0x8\n" + "ld1 { v7.d }[0], [x20], #0x8\n" + "tbz %x[n_output_channels], #0, 17f\n" + "ld1 { v8.s }[2], [x21], #0x4\n" + "ld1 { v7.s }[2], [x20], #0x4\n" + "b 17f\n" + "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset + "tbz %x[n_output_channels], #0, 17f\n" + "ld1 { v8.s }[0], [x21], #0x4\n" + "ld1 { v7.s }[0], [x20], #0x4\n" + "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End + + 
"18:" // Output channel oddments: Load quantization parameters: Done + "ldr s17, [%x[weights]], #0x4\n" + "usubl v17.8h, v17.8b, v11.8b\n" + "mov x19, %x[inptrs]\n" + "ldp x25, x27, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "ldr d3, [x25, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr d2, [x27, #0x0]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "cbz x20, 22f\n" + "ldp x25, x27, [x19], #0x10\n" + "ldr s16, [%x[weights]], #0x4\n" + "usubl v16.8h, v16.8b, v11.8b\n" + "ldr d1, [x25, #0x0]\n" + "subs x20, x20, #0x1\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr d0, [x27, #0x0]\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "beq 20f\n" + "19:" // Output channel oddments: Kernel loop + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "usubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "ldr d1, [x25, #0x0]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "ldr d0, [x27, #0x0]\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr s16, [%x[weights]], #0x4\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "usubl v16.8h, v16.8b, v11.8b\n" + "bgt 19b\n" + "20:" // Output channel oddments: Kernel loop tail + "tbnz %x[kernel_points], #0, 21f\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "b 
23f\n" + "21:" // Output channel oddments: Odd tail + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "usubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "b 23f\n" + "22:" // Output channel oddments: Single kernel point + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "23:" // Output channel oddments: Done + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "and v17.16b, v4.16b, v7.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "and v16.16b, v31.16b, v7.16b\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "add v4.4s, v4.4s, v10.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "smin v4.4s, v4.4s, 
v13.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "and v16.16b, v30.16b, v7.16b\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "and v17.16b, v29.16b, v7.16b\n" + "smin v31.4s, v31.4s, v13.4s\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smax v31.4s, v31.4s, v14.4s\n" + "and v16.16b, v28.16b, v7.16b\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "add v29.4s, v29.4s, v10.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smin v29.4s, v29.4s, v13.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v28.4s, v28.4s, v10.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smax v28.4s, v28.4s, v14.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "and v17.16b, v26.16b, v7.16b\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "and v16.16b, v25.16b, v7.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v27.4s, v27.4s, v13.4s\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "add v26.4s, v26.4s, v10.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "and v17.16b, v24.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "smin v25.4s, v25.4s, v13.4s\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "smax v25.4s, v25.4s, v14.4s\n" + "and v16.16b, v23.16b, v7.16b\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v24.4s, v24.4s, v13.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "uzp1 v24.16b, 
v24.16b, v24.16b\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "and v16.16b, v21.16b, v7.16b\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "smax v23.4s, v23.4s, v14.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v22.4s, v22.4s, v13.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "add v21.4s, v21.4s, v10.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "smax v21.4s, v21.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "smin v20.4s, v20.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "tbz %x[n_output_channels], #1, 24f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.h }[0], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.h }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.h }[0], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.h }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.h }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.h }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.h }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.h }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.h }[0], [x25]\n" + "add x9, x9, #0x2\n" + "st1 { v19.h }[0], [x26]\n" + "tbz %x[n_output_channels], #0, 25f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.b }[2], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.b }[2], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.b }[2], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.b }[2], 
[x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.b }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.b }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.b }[2], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.b }[2], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.b }[2], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.b }[2], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.b }[2], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.b }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.b }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.b }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v19.b }[2], [x26]\n" + "b 25f\n" + "24:" // Output channel oddments: Done: Store: Bit 1: Unset + "tbz %x[n_output_channels], #0, 25f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.b }[0], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.b }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.b }[0], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.b }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.b }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.b }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.b }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.b }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.b }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.b }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.b }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.b }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.b }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.b }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.b }[0], [x25]\n" + "st1 { v19.b }[0], [x26]\n" + "25:" // Output channel oddments: Done: Store: Bit 1: End + + "26:" // Done + + : [weights] "+&r" (weights) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), 
[qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..1bacb5ffe7 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef int8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + + kern_type kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + + a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..8cbbfae00d --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1192 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */
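For reference while reading the ~1,200 lines of generated assembly that follow: a minimal scalar sketch of the requantization that the sqrdmulh / and / sshr / sqadd / srshl sequences implement, assuming (as the offsetof operands below indicate) that mul and shift come from the per-layer or per-channel tables and that c_offset, minval and maxval are the arm_gemm::Requantize32 fields. The helper names here are illustrative only, not library API.

    #include <algorithm>
    #include <cstdint>

    // SQRDMULH per lane: high half of the doubled product, with rounding
    // (the saturating INT32_MIN * INT32_MIN corner case is ignored here).
    static inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        const int64_t p = 2 * static_cast<int64_t>(a) * b + (1LL << 31);
        return static_cast<int32_t>(p >> 32);
    }

    // The and/sshr/sqadd/srshl group for a right shift s >= 1: divide by
    // 2^s rounding to nearest, with ties rounded away from zero.
    static inline int32_t rounding_divide_by_pot(int32_t x, int s)
    {
        const int32_t fixup = (x < 0) ? -1 : 0;    // the sqadd of the sign mask
        return (x + fixup + (1 << (s - 1))) >> s;  // the srshl rounding shift
    }

    // One uint8 output lane; the shift table stores non-positive values.
    static inline uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                                     int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int32_t v = rounding_doubling_high_mul(acc, mul);
        if (shift < 0) v = rounding_divide_by_pot(v, -shift);
        v += c_offset;  // the "add ..., v13.4s" instructions in this kernel
        return static_cast<uint8_t>(std::min(std::max(v, minval), maxval));
    }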
+ +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[16]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[5]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[3]; + inptrs[3] = inptrs_raw[6]; + inptrs[4] = inptrs_raw[9]; + inptrs[5] = inptrs_raw[12]; + inptrs[6] = inptrs_raw[15]; + inptrs[7] = inptrs_raw[1]; + inptrs[8] = inptrs_raw[2]; + inptrs[9] = inptrs_raw[10]; + inptrs[10] = inptrs_raw[4]; + inptrs[11] = inptrs_raw[7]; + inptrs[12] = inptrs_raw[8]; + inptrs[13] = inptrs_raw[11]; + inptrs[14] = inptrs_raw[13]; + inptrs[15] = inptrs_raw[14]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x17, #0x0\n" + "ldr x16, [%x[params], %[offsetof_Params_weights]]\n" + "mov x15, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x14, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x12, x8, #0x3\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v21.16b }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v17.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v13.4s }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v15.4s }, [x20]\n" + "ld1r { v14.4s }, [x19]\n" + "ldp x10, x9, [x21, #0x0]\n" + "ldp x28, x27, [x21, #0x10]\n" + "cbz x12, 3f\n" + "subs x12, x12, #0x1\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q11, [x19, #0x0]\n" + "mov v23.16b, v11.16b\n" + "ldr q26, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v12.16b, v11.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v24.16b, v11.16b\n" + "ldr d0, [x16, #0x0]\n" + "ldr d1, [x16, #0x8]\n" + "mov v9.16b, v26.16b\n" + "ldr d2, [x16, #0x10]\n" + "mov v22.16b, v26.16b\n" + "ldr d3, [x16, #0x18]\n" + "mov v10.16b, v26.16b\n" + "ldr d4, [x16, #0x20]\n" + "ssubl v0.8h, v0.8b, v17.8b\n" + "ldr d5, [x16, #0x28]\n" + "ssubl v1.8h, v1.8b, v17.8b\n" + "ldr d6, [x16, #0x30]\n" + "ssubl v2.8h, v2.8b, v17.8b\n" + "ldr d7, [x16, #0x38]\n" + "ssubl v3.8h, v3.8b, v17.8b\n" + "ldr d8, [x16, #0x40]\n" + "ssubl v4.8h, v4.8b, v17.8b\n" +
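+ // Annotation (not part of the generated kernel): v17 was broadcast from the
+ // weight zero point (b_offset) and v21 from the activation zero point
+ // (a_offset). The ssubl instructions widen each int8 weight vector to 16
+ // bits while subtracting b_offset, and the usubl instructions do the same
+ // for the uint8 input vectors with a_offset, so the smlal/smlal2
+ // multiply-accumulates in the loop run on offset-corrected 16-bit values
+ // and the int32 accumulators need no zero-point fix-up afterwards.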
"ldp x23, x22, [x14, #0x0]\n" + "ssubl v5.8h, v5.8b, v17.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "ssubl v6.8h, v6.8b, v17.8b\n" + "ssubl v7.8h, v7.8b, v17.8b\n" + "ldr x19, [x14, #0x20]\n" + "ssubl v8.8h, v8.8b, v17.8b\n" + "ldr d31, [x23, x17]\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "ldr d30, [x22, x17]\n" + "ldr d29, [x21, x17]\n" + "usubl v30.8h, v30.8b, v21.8b\n" + "ldr d28, [x20, x17]\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "ldr d27, [x19, x17]\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "usubl v27.8h, v27.8b, v21.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v11.4s, v31.4h, v4.4h\n" + "ldr x21, [x14, #0x28]\n" + "add x16, x16, #0x48\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x20, [x14, #0x30]\n" + "subs x12, x12, #0x1\n" + "smlal v23.4s, v31.4h, v3.4h\n" + "ldr x26, [x14, #0x38]\n" + "smlal2 v9.4s, v31.8h, v3.8h\n" + "ldr x25, [x14, #0x40]\n" + "smlal v12.4s, v31.4h, v1.4h\n" + "ldr x19, [x14, #0x48]\n" + "smlal2 v22.4s, v31.8h, v1.8h\n" + "ldr x24, [x14, #0x50]\n" + "smlal v24.4s, v31.4h, v0.4h\n" + "ldr x23, [x14, #0x58]\n" + "smlal2 v10.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x17]\n" + "smlal v11.4s, v30.4h, v0.4h\n" + "ldr x22, [x14, #0x60]\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "ldr d30, [x19, x17]\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr x21, [x14, #0x68]\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x17]\n" + "smlal v11.4s, v28.4h, v5.4h\n" + "ldr x20, [x14, #0x70]\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "ldr x19, [x14, #0x78]\n" + "smlal v23.4s, v28.4h, v4.4h\n" + "ldr q25, [x13, #0x0]\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr q18, [x11, #0x0]\n" + "smlal v12.4s, v28.4h, v2.4h\n" + "ldr q16, [x13, #0x10]\n" + "add x13, x13, #0x20\n" + "smlal2 v22.4s, v28.8h, v2.8h\n" + "ldr q20, [x11, #0x10]\n" + "add x11, x11, #0x20\n" + "smlal v24.4s, v28.4h, v1.4h\n" + "smlal2 v10.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x17]\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "smlal v12.4s, v31.4h, v6.4h\n" + "smlal2 v22.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x17]\n" + "smlal v23.4s, v27.4h, v6.4h\n" + "smlal2 v9.4s, v27.8h, v6.8h\n" + "smlal v12.4s, v27.4h, v4.4h\n" + "smlal2 v22.4s, v27.8h, v4.8h\n" + "smlal v24.4s, v27.4h, v3.4h\n" + "smlal2 v10.4s, v27.8h, v3.8h\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "smlal v24.4s, v29.4h, v8.4h\n" + "smlal2 v10.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x17]\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "smlal v23.4s, v28.4h, v0.4h\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x17]\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v23.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x17]\n" + "usubl v30.8h, v30.8b, v21.8b\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "smlal v11.4s, v30.4h, v8.4h\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v23.4s, v30.4h, v7.4h\n" + "smlal2 v9.4s, v30.8h, v7.8h\n" + "smlal v12.4s, v30.4h, v5.4h\n" + "smlal2 v22.4s, v30.8h, v5.8h\n" + "smlal v24.4s, v30.4h, v4.4h\n" + "smlal2 v10.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x17]\n" + "smlal v11.4s, v29.4h, v3.4h\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal v12.4s, v29.4h, v0.4h\n" + "smlal2 v22.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x17]\n" + "smlal v23.4s, v28.4h, v5.4h\n" + "smlal2 v9.4s, v28.8h, v5.8h\n" + "smlal v24.4s, v28.4h, v2.4h\n" + "smlal2 v10.4s, v28.8h, v2.8h\n" + "ldr d28, [x19, x17]\n" + "add x17, x17, 
#0x8\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "usubl v30.8h, v30.8b, v21.8b\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "smlal v11.4s, v31.4h, v6.4h\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal v12.4s, v31.4h, v3.4h\n" + "smlal2 v22.4s, v31.8h, v3.8h\n" + "smlal v23.4s, v30.4h, v8.4h\n" + "smlal2 v9.4s, v30.8h, v8.8h\n" + "smlal v24.4s, v30.4h, v5.4h\n" + "smlal2 v10.4s, v30.8h, v5.8h\n" + "smlal v12.4s, v29.4h, v7.4h\n" + "smlal2 v22.4s, v29.8h, v7.8h\n" + "smlal v24.4s, v29.4h, v6.4h\n" + "smlal2 v10.4s, v29.8h, v6.8h\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "sqrdmulh v11.4s, v11.4s, v25.4s\n" + "sqrdmulh v26.4s, v26.4s, v16.4s\n" + "smlal v12.4s, v28.4h, v8.4h\n" + "smlal2 v22.4s, v28.8h, v8.8h\n" + "smlal v24.4s, v28.4h, v7.4h\n" + "smlal2 v10.4s, v28.8h, v7.8h\n" + "and v19.16b, v11.16b, v18.16b\n" + "and v5.16b, v26.16b, v20.16b\n" + "sqrdmulh v23.4s, v23.4s, v25.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqrdmulh v9.4s, v9.4s, v16.4s\n" + "sqadd v11.4s, v11.4s, v19.4s\n" + "sqadd v26.4s, v26.4s, v5.4s\n" + "and v28.16b, v23.16b, v18.16b\n" + "and v8.16b, v9.16b, v20.16b\n" + "srshl v11.4s, v11.4s, v18.4s\n" + "srshl v26.4s, v26.4s, v20.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "add v11.4s, v11.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v28.4s\n" + "smin v11.4s, v11.4s, v14.4s\n" + "smin v26.4s, v26.4s, v14.4s\n" + "sqadd v9.4s, v9.4s, v8.4s\n" + "smax v11.4s, v11.4s, v15.4s\n" + "smax v26.4s, v26.4s, v15.4s\n" + "srshl v23.4s, v23.4s, v18.4s\n" + "srshl v9.4s, v9.4s, v20.4s\n" + "uzp1 v11.16b, v11.16b, v26.16b\n" + "sqrdmulh v12.4s, v12.4s, v25.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x10, x15]\n" + "add v23.4s, v23.4s, v13.4s\n" + "add v9.4s, v9.4s, v13.4s\n" + "and v1.16b, v12.16b, v18.16b\n" + "sqrdmulh v22.4s, v22.4s, v16.4s\n" + "smin v23.4s, v23.4s, v14.4s\n" + "smin v9.4s, v9.4s, v14.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "smax v23.4s, v23.4s, v15.4s\n" + "smax v9.4s, v9.4s, v15.4s\n" + "sqadd v12.4s, v12.4s, v1.4s\n" + "and v0.16b, v22.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v9.16b\n" + "sqrdmulh v24.4s, v24.4s, v25.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d23, [x9, x15]\n" + "srshl v12.4s, v12.4s, v18.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "and v26.16b, v24.16b, v18.16b\n" + "sqrdmulh v10.4s, v10.4s, v16.4s\n" + "sqadd v22.4s, v22.4s, v0.4s\n" + "add v12.4s, v12.4s, v13.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v16.16b, v10.16b, v20.16b\n" + "smin v12.4s, v12.4s, v14.4s\n" + "srshl v22.4s, v22.4s, v20.4s\n" + "sqadd v24.4s, v24.4s, v26.4s\n" + "smax v12.4s, v12.4s, v15.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v24.4s, v24.4s, v18.4s\n" + "sqadd v10.4s, v10.4s, v16.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "smax v22.4s, v22.4s, v15.4s\n" + "srshl v10.4s, v10.4s, v20.4s\n" + "smin v24.4s, v24.4s, v14.4s\n" + "uzp1 v12.16b, v12.16b, v22.16b\n" + "add v10.4s, v10.4s, v13.4s\n" + "uzp1 v12.16b, v12.16b, v12.16b\n" + "str d12, [x28, x15]\n" + "smax v24.4s, v24.4s, v15.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "smax v10.4s, v10.4s, v15.4s\n" + "uzp1 v24.16b, v24.16b, v10.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str d24, [x27, x15]\n" + "add x15, x15, #0x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q11, [x19, #0x0]\n" + "mov v23.16b, v11.16b\n" + "ldr q26, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v12.16b, v11.16b\n" + "str x19, [%x[params], 
%[offsetof_Params_bias]]\n" + "mov v24.16b, v11.16b\n" + "ldr d0, [x16, #0x0]\n" + "ldr d1, [x16, #0x8]\n" + "mov v9.16b, v26.16b\n" + "ldr d2, [x16, #0x10]\n" + "mov v22.16b, v26.16b\n" + "ldr d3, [x16, #0x18]\n" + "mov v10.16b, v26.16b\n" + "ldr d4, [x16, #0x20]\n" + "ssubl v0.8h, v0.8b, v17.8b\n" + "ldr d5, [x16, #0x28]\n" + "ssubl v1.8h, v1.8b, v17.8b\n" + "ldr d6, [x16, #0x30]\n" + "ssubl v2.8h, v2.8b, v17.8b\n" + "ldr d7, [x16, #0x38]\n" + "ssubl v3.8h, v3.8b, v17.8b\n" + "ldr d8, [x16, #0x40]\n" + "ssubl v4.8h, v4.8b, v17.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "ssubl v5.8h, v5.8b, v17.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "ssubl v6.8h, v6.8b, v17.8b\n" + "ssubl v7.8h, v7.8b, v17.8b\n" + "ldr x19, [x14, #0x20]\n" + "ssubl v8.8h, v8.8b, v17.8b\n" + "ldr d31, [x23, x17]\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "ldr d30, [x22, x17]\n" + "ldr d29, [x21, x17]\n" + "usubl v30.8h, v30.8b, v21.8b\n" + "ldr d28, [x20, x17]\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "ldr d27, [x19, x17]\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "usubl v27.8h, v27.8b, v21.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v11.4s, v31.4h, v4.4h\n" + "ldr x21, [x14, #0x28]\n" + "tst x8, #0x7\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x20, [x14, #0x30]\n" + "smlal v23.4s, v31.4h, v3.4h\n" + "ldr x26, [x14, #0x38]\n" + "smlal2 v9.4s, v31.8h, v3.8h\n" + "ldr x25, [x14, #0x40]\n" + "smlal v12.4s, v31.4h, v1.4h\n" + "ldr x19, [x14, #0x48]\n" + "smlal2 v22.4s, v31.8h, v1.8h\n" + "ldr x24, [x14, #0x50]\n" + "smlal v24.4s, v31.4h, v0.4h\n" + "ldr x23, [x14, #0x58]\n" + "smlal2 v10.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x17]\n" + "smlal v11.4s, v30.4h, v0.4h\n" + "ldr x22, [x14, #0x60]\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "ldr d30, [x19, x17]\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr x21, [x14, #0x68]\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x17]\n" + "smlal v11.4s, v28.4h, v5.4h\n" + "ldr x20, [x14, #0x70]\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "ldr x19, [x14, #0x78]\n" + "smlal v23.4s, v28.4h, v4.4h\n" + "ldr q25, [x13, #0x0]\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr q18, [x11, #0x0]\n" + "smlal v12.4s, v28.4h, v2.4h\n" + "ldr q16, [x13, #0x10]\n" + "add x13, x13, #0x20\n" + "smlal2 v22.4s, v28.8h, v2.8h\n" + "ldr q20, [x11, #0x10]\n" + "add x11, x11, #0x20\n" + "smlal v24.4s, v28.4h, v1.4h\n" + "smlal2 v10.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x17]\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "smlal v12.4s, v31.4h, v6.4h\n" + "smlal2 v22.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x17]\n" + "smlal v23.4s, v27.4h, v6.4h\n" + "smlal2 v9.4s, v27.8h, v6.8h\n" + "smlal v12.4s, v27.4h, v4.4h\n" + "smlal2 v22.4s, v27.8h, v4.8h\n" + "smlal v24.4s, v27.4h, v3.4h\n" + "smlal2 v10.4s, v27.8h, v3.8h\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "smlal v24.4s, v29.4h, v8.4h\n" + "smlal2 v10.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x17]\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "smlal v23.4s, v28.4h, v0.4h\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x17]\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v23.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x17]\n" + "usubl v30.8h, v30.8b, v21.8b\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "smlal v11.4s, v30.4h, v8.4h\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v23.4s, v30.4h, v7.4h\n" + "smlal2 v9.4s, v30.8h, v7.8h\n" + 
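+ // Annotation (not part of the generated kernel): this "Tail" block is the
+ // loop body unrolled once more for the last full vector of eight channels.
+ // The counter x12 = n_channels / 8 was pre-decremented, so the loop at
+ // label 1 runs n_channels / 8 - 1 times, and the "tst x8, #0x7" above
+ // records whether the "Oddments" code at label 3 still has leftover lanes.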
"smlal v12.4s, v30.4h, v5.4h\n" + "smlal2 v22.4s, v30.8h, v5.8h\n" + "smlal v24.4s, v30.4h, v4.4h\n" + "smlal2 v10.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x17]\n" + "smlal v11.4s, v29.4h, v3.4h\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal v12.4s, v29.4h, v0.4h\n" + "smlal2 v22.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x17]\n" + "smlal v23.4s, v28.4h, v5.4h\n" + "smlal2 v9.4s, v28.8h, v5.8h\n" + "smlal v24.4s, v28.4h, v2.4h\n" + "smlal2 v10.4s, v28.8h, v2.8h\n" + "ldr d28, [x19, x17]\n" + "add x17, x17, #0x8\n" + "usubl v31.8h, v31.8b, v21.8b\n" + "usubl v30.8h, v30.8b, v21.8b\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "smlal v11.4s, v31.4h, v6.4h\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal v12.4s, v31.4h, v3.4h\n" + "smlal2 v22.4s, v31.8h, v3.8h\n" + "smlal v23.4s, v30.4h, v8.4h\n" + "smlal2 v9.4s, v30.8h, v8.8h\n" + "smlal v24.4s, v30.4h, v5.4h\n" + "smlal2 v10.4s, v30.8h, v5.8h\n" + "smlal v12.4s, v29.4h, v7.4h\n" + "smlal2 v22.4s, v29.8h, v7.8h\n" + "smlal v24.4s, v29.4h, v6.4h\n" + "smlal2 v10.4s, v29.8h, v6.8h\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "sqrdmulh v11.4s, v11.4s, v25.4s\n" + "sqrdmulh v26.4s, v26.4s, v16.4s\n" + "smlal v12.4s, v28.4h, v8.4h\n" + "smlal2 v22.4s, v28.8h, v8.8h\n" + "smlal v24.4s, v28.4h, v7.4h\n" + "smlal2 v10.4s, v28.8h, v7.8h\n" + "and v19.16b, v11.16b, v18.16b\n" + "and v5.16b, v26.16b, v20.16b\n" + "sqrdmulh v23.4s, v23.4s, v25.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqrdmulh v9.4s, v9.4s, v16.4s\n" + "sqadd v11.4s, v11.4s, v19.4s\n" + "sqadd v26.4s, v26.4s, v5.4s\n" + "and v28.16b, v23.16b, v18.16b\n" + "and v8.16b, v9.16b, v20.16b\n" + "srshl v11.4s, v11.4s, v18.4s\n" + "srshl v26.4s, v26.4s, v20.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "add v11.4s, v11.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v28.4s\n" + "smin v11.4s, v11.4s, v14.4s\n" + "smin v26.4s, v26.4s, v14.4s\n" + "sqadd v9.4s, v9.4s, v8.4s\n" + "smax v11.4s, v11.4s, v15.4s\n" + "smax v26.4s, v26.4s, v15.4s\n" + "srshl v23.4s, v23.4s, v18.4s\n" + "srshl v9.4s, v9.4s, v20.4s\n" + "uzp1 v11.16b, v11.16b, v26.16b\n" + "sqrdmulh v12.4s, v12.4s, v25.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x10, x15]\n" + "add v23.4s, v23.4s, v13.4s\n" + "add v9.4s, v9.4s, v13.4s\n" + "and v1.16b, v12.16b, v18.16b\n" + "sqrdmulh v22.4s, v22.4s, v16.4s\n" + "smin v23.4s, v23.4s, v14.4s\n" + "smin v9.4s, v9.4s, v14.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "smax v23.4s, v23.4s, v15.4s\n" + "smax v9.4s, v9.4s, v15.4s\n" + "sqadd v12.4s, v12.4s, v1.4s\n" + "and v0.16b, v22.16b, v20.16b\n" + "uzp1 v23.16b, v23.16b, v9.16b\n" + "sqrdmulh v24.4s, v24.4s, v25.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d23, [x9, x15]\n" + "srshl v12.4s, v12.4s, v18.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "and v26.16b, v24.16b, v18.16b\n" + "sqrdmulh v10.4s, v10.4s, v16.4s\n" + "sqadd v22.4s, v22.4s, v0.4s\n" + "add v12.4s, v12.4s, v13.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v16.16b, v10.16b, v20.16b\n" + "smin v12.4s, v12.4s, v14.4s\n" + "srshl v22.4s, v22.4s, v20.4s\n" + "sqadd v24.4s, v24.4s, v26.4s\n" + "smax v12.4s, v12.4s, v15.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v22.4s, v22.4s, v13.4s\n" + "srshl v24.4s, v24.4s, v18.4s\n" + "sqadd v10.4s, v10.4s, v16.4s\n" + "smin v22.4s, v22.4s, v14.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "smax v22.4s, v22.4s, v15.4s\n" + "srshl v10.4s, v10.4s, v20.4s\n" + "smin v24.4s, v24.4s, v14.4s\n" + "uzp1 v12.16b, v12.16b, v22.16b\n" + "add v10.4s, v10.4s, v13.4s\n" + "uzp1 v12.16b, 
v12.16b, v12.16b\n" + "str d12, [x28, x15]\n" + "smax v24.4s, v24.4s, v15.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "smax v10.4s, v10.4s, v15.4s\n" + "uzp1 v24.16b, v24.16b, v10.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str d24, [x27, x15]\n" + "add x15, x15, #0x8\n" + "beq 64f\n" + "add x16, x16, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x8, #2, 5f\n" + "ld1 { v11.4s }, [x19], #0x10\n" + "tbz x8, #1, 4f\n" + "ld1 { v26.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v26.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v26.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x8, #1, 6f\n" + "ld1 { v11.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v11.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v23.16b, v11.16b\n" + "ldr d0, [x16, #0x0]\n" + "mov v9.16b, v26.16b\n" + "ldr d1, [x16, #0x8]\n" + "mov v12.16b, v11.16b\n" + "ldr d2, [x16, #0x10]\n" + "mov v22.16b, v26.16b\n" + "ldr d3, [x16, #0x18]\n" + "mov v24.16b, v11.16b\n" + "ldr d4, [x16, #0x20]\n" + "mov v10.16b, v26.16b\n" + "ldr d5, [x16, #0x28]\n" + "ssubl v0.8h, v0.8b, v17.8b\n" + "ldr d6, [x16, #0x30]\n" + "ssubl v1.8h, v1.8b, v17.8b\n" + "ldr d7, [x16, #0x38]\n" + "ssubl v2.8h, v2.8b, v17.8b\n" + "ldr d8, [x16, #0x40]\n" + "ssubl v3.8h, v3.8b, v17.8b\n" + "ldp x23, x22, [x14, #0x0]\n" + "add x23, x23, x17\n" + "ssubl v4.8h, v4.8b, v17.8b\n" + "ldp x21, x20, [x14, #0x10]\n" + "ssubl v5.8h, v5.8b, v17.8b\n" + "ldr x19, [x14, #0x20]\n" + "ssubl v6.8h, v6.8b, v17.8b\n" + "add x22, x22, x17\n" + "ssubl v7.8h, v7.8b, v17.8b\n" + "add x21, x21, x17\n" + "ssubl v8.8h, v8.8b, v17.8b\n" + "add x20, x20, x17\n" + "add x19, x19, x17\n" + "tbz x8, #2, 9f\n" + "ld1 { v31.s }[0], [x23], #0x4\n" + "ld1 { v30.s }[0], [x22], #0x4\n" + "ld1 { v29.s }[0], [x21], #0x4\n" + "ld1 { v28.s }[0], [x20], #0x4\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 8f\n" + "ld1 { v31.h }[2], [x23], #0x2\n" + "ld1 { v30.h }[2], [x22], #0x2\n" + "ld1 { v29.h }[2], [x21], #0x2\n" + "ld1 { v28.h }[2], [x20], #0x2\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[6], [x23]\n" + "ld1 { v30.b }[6], [x22]\n" + "ld1 { v29.b }[6], [x21]\n" + "ld1 { v28.b }[6], [x20]\n" + "ld1 { v27.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[4], [x23]\n" + "ld1 { v30.b }[4], [x22]\n" + "ld1 { v29.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x20]\n" + "ld1 { v27.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x8, #1, 10f\n" + "ld1 { v31.h }[0], [x23], #0x2\n" + "ld1 { v30.h }[0], [x22], #0x2\n" + "ld1 { v29.h }[0], [x21], #0x2\n" + "ld1 { v28.h }[0], [x20], #0x2\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[2], [x23]\n" + "ld1 { v30.b }[2], [x22]\n" + "ld1 { v29.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x20]\n" + "ld1 { v27.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[0], [x23]\n" + "ld1 { v30.b }[0], [x22]\n" + "ld1 { v29.b }[0], [x21]\n" + "ld1 { v28.b }[0], [x20]\n" + "ld1 { v27.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "usubl v31.8h, v31.8b, v21.8b\n" + "ldr x21, [x14, #0x28]\n" + "add x21, x21, x17\n" + "usubl v30.8h, v30.8b, 
v21.8b\n" + "usubl v29.8h, v29.8b, v21.8b\n" + "usubl v28.8h, v28.8b, v21.8b\n" + "usubl v27.8h, v27.8b, v21.8b\n" + "smlal v11.4s, v31.4h, v4.4h\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "smlal v23.4s, v31.4h, v3.4h\n" + "smlal2 v9.4s, v31.8h, v3.8h\n" + "smlal v12.4s, v31.4h, v1.4h\n" + "smlal2 v22.4s, v31.8h, v1.8h\n" + "smlal v24.4s, v31.4h, v0.4h\n" + "smlal2 v10.4s, v31.8h, v0.8h\n" + "smlal v11.4s, v30.4h, v0.4h\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "smlal v11.4s, v28.4h, v5.4h\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "smlal v23.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "smlal v12.4s, v28.4h, v2.4h\n" + "smlal2 v22.4s, v28.8h, v2.8h\n" + "smlal v24.4s, v28.4h, v1.4h\n" + "smlal2 v10.4s, v28.8h, v1.8h\n" + "tbz x8, #2, 13f\n" + "ld1 { v31.s }[0], [x21], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v31.h }[2], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[6], [x21]\n" + "b 15f\n" + "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[4], [x21]\n" + "b 15f\n" + "13:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x8, #1, 14f\n" + "ld1 { v31.h }[0], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[2], [x21]\n" + "b 15f\n" + "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[0], [x21]\n" + "15:" // Oddments: Load (3, 0): Bit 2: End + "usubl v31.8h, v31.8b, v21.8b\n" + "ldr x20, [x14, #0x30]\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "add x20, x20, x17\n" + "smlal v12.4s, v31.4h, v6.4h\n" + "smlal2 v22.4s, v31.8h, v6.8h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "smlal v23.4s, v27.4h, v6.4h\n" + "smlal2 v9.4s, v27.8h, v6.8h\n" + "smlal v12.4s, v27.4h, v4.4h\n" + "smlal2 v22.4s, v27.8h, v4.8h\n" + "smlal v24.4s, v27.4h, v3.4h\n" + "smlal2 v10.4s, v27.8h, v3.8h\n" + "tbz x8, #2, 17f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 19f\n" + "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 19f\n" + "17:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x8, #1, 18f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 19f\n" + "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[0], [x20]\n" + "19:" // Oddments: Load (3, 3): Bit 2: End + "usubl v29.8h, v29.8b, v21.8b\n" + "ldr x26, [x14, #0x38]\n" + "smlal v24.4s, v29.4h, v8.4h\n" + "add x26, x26, x17\n" + "smlal2 v10.4s, v29.8h, v8.8h\n" + "tbz x8, #2, 21f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x8, #1, 20f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[6], [x26]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[4], [x26]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 1): Bit 2: Unset + "tbz x8, #1, 22f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[2], [x26]\n" + "b 23f\n" + "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[0], [x26]\n" + "23:" // Oddments: Load (0, 1): Bit 2: End + "usubl v28.8h, v28.8b, v21.8b\n" + "ldr x25, [x14, #0x40]\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "add x25, x25, x17\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "smlal v23.4s, v28.4h, v0.4h\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "tbz x8, #2, 25f\n" + "ld1 { v31.s }[0], [x25], #0x4\n" + "tbz 
x8, #1, 24f\n" + "ld1 { v31.h }[2], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[6], [x25]\n" + "b 27f\n" + "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[4], [x25]\n" + "b 27f\n" + "25:" // Oddments: Load (0, 2): Bit 2: Unset + "tbz x8, #1, 26f\n" + "ld1 { v31.h }[0], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[2], [x25]\n" + "b 27f\n" + "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[0], [x25]\n" + "27:" // Oddments: Load (0, 2): Bit 2: End + "usubl v31.8h, v31.8b, v21.8b\n" + "ldr x19, [x14, #0x48]\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "add x19, x19, x17\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v23.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "tbz x8, #2, 29f\n" + "ld1 { v30.s }[0], [x19], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v30.h }[2], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[6], [x19]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[4], [x19]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x8, #1, 30f\n" + "ld1 { v30.h }[0], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[2], [x19]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[0], [x19]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "usubl v30.8h, v30.8b, v21.8b\n" + "ldr x24, [x14, #0x50]\n" + "smlal v11.4s, v30.4h, v8.4h\n" + "add x24, x24, x17\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v23.4s, v30.4h, v7.4h\n" + "smlal2 v9.4s, v30.8h, v7.8h\n" + "smlal v12.4s, v30.4h, v5.4h\n" + "smlal2 v22.4s, v30.8h, v5.8h\n" + "smlal v24.4s, v30.4h, v4.4h\n" + "smlal2 v10.4s, v30.8h, v4.8h\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 35f\n" + "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 35f\n" + "33:" // Oddments: Load (1, 0): Bit 2: Unset + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 35f\n" + "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x24]\n" + "35:" // Oddments: Load (1, 0): Bit 2: End + "usubl v29.8h, v29.8b, v21.8b\n" + "ldr x23, [x14, #0x58]\n" + "smlal v11.4s, v29.4h, v3.4h\n" + "add x23, x23, x17\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal v12.4s, v29.4h, v0.4h\n" + "smlal2 v22.4s, v29.8h, v0.8h\n" + "tbz x8, #2, 37f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[6], [x23]\n" + "b 39f\n" + "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[4], [x23]\n" + "b 39f\n" + "37:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x8, #1, 38f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[2], [x23]\n" + "b 39f\n" + "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[0], [x23]\n" + "39:" // Oddments: Load (1, 3): Bit 2: End + "usubl v28.8h, v28.8b, v21.8b\n" + "ldr x22, [x14, #0x60]\n" + "smlal v23.4s, v28.4h, v5.4h\n" + "add x22, x22, x17\n" + "smlal2 v9.4s, v28.8h, v5.8h\n" + "smlal v24.4s, v28.4h, v2.4h\n" + "smlal2 v10.4s, v28.8h, v2.8h\n" + "tbz x8, #2, 41f\n" + "ld1 { v31.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + 
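+ // Annotation (not part of the generated kernel): these "Oddments" ladders
+ // handle the trailing n_channels % 8 lanes. tbz tests bits 2, 1 and 0 of
+ // the channel count and skips the matching ld1/st1, so each leftover
+ // pointer sees at most one 4-byte, one 2-byte and one 1-byte access and
+ // the kernel never touches memory past the final channel.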
"ld1 { v31.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x8, #1, 42f\n" + "ld1 { v31.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 0): Bit 2: End + "usubl v31.8h, v31.8b, v21.8b\n" + "ldr x21, [x14, #0x68]\n" + "smlal v11.4s, v31.4h, v6.4h\n" + "add x21, x21, x17\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal v12.4s, v31.4h, v3.4h\n" + "smlal2 v22.4s, v31.8h, v3.8h\n" + "tbz x8, #2, 45f\n" + "ld1 { v30.s }[0], [x21], #0x4\n" + "tbz x8, #1, 44f\n" + "ld1 { v30.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[6], [x21]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[4], [x21]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x8, #1, 46f\n" + "ld1 { v30.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[2], [x21]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[0], [x21]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "usubl v30.8h, v30.8b, v21.8b\n" + "ldr x20, [x14, #0x70]\n" + "smlal v23.4s, v30.4h, v8.4h\n" + "add x20, x20, x17\n" + "smlal2 v9.4s, v30.8h, v8.8h\n" + "smlal v24.4s, v30.4h, v5.4h\n" + "smlal2 v10.4s, v30.8h, v5.8h\n" + "tbz x8, #2, 49f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x8, #1, 50f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[0], [x20]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "usubl v29.8h, v29.8b, v21.8b\n" + "ldr x19, [x14, #0x78]\n" + "smlal v12.4s, v29.4h, v7.4h\n" + "add x19, x19, x17\n" + "smlal2 v22.4s, v29.8h, v7.8h\n" + "smlal v24.4s, v29.4h, v6.4h\n" + "smlal2 v10.4s, v29.8h, v6.8h\n" + "tbz x8, #2, 53f\n" + "ld1 { v28.s }[0], [x19], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v28.h }[2], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[6], [x19]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[4], [x19]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x8, #1, 54f\n" + "ld1 { v28.h }[0], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[2], [x19]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[0], [x19]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "usubl v28.8h, v28.8b, v21.8b\n" + "smlal v12.4s, v28.4h, v8.4h\n" + "smlal2 v22.4s, v28.8h, v8.8h\n" + "smlal v24.4s, v28.4h, v7.4h\n" + "smlal2 v10.4s, v28.8h, v7.8h\n" + "tbz x8, #2, 57f\n" + "ld1 { v25.4s }, [x13], #0x10\n" + "ld1 { v18.4s }, [x11], #0x10\n" + "tbz x8, #1, 56f\n" + "ld1 { v16.d }[0], [x13], #0x8\n" + "ld1 { v20.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v16.s }[2], [x13]\n" + "ld1 { v20.s }[2], [x11]\n" + "b 59f\n" + "56:" 
// Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v16.s }[0], [x13]\n" + "ld1 { v20.s }[0], [x11]\n" + "b 59f\n" + "57:" // Oddments: Load requant params: Bit 2: Unset + "tbz x8, #1, 58f\n" + "ld1 { v25.d }[0], [x13], #0x8\n" + "ld1 { v18.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v18.s }[2], [x11]\n" + "b 59f\n" + "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v25.s }[0], [x13]\n" + "ld1 { v18.s }[0], [x11]\n" + "59:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v11.4s, v11.4s, v25.4s\n" + "add x10, x10, x15\n" + "sqrdmulh v26.4s, v26.4s, v16.4s\n" + "add x9, x9, x15\n" + "sqrdmulh v23.4s, v23.4s, v25.4s\n" + "add x28, x28, x15\n" + "sqrdmulh v9.4s, v9.4s, v16.4s\n" + "add x27, x27, x15\n" + "sqrdmulh v12.4s, v12.4s, v25.4s\n" + "and v19.16b, v11.16b, v18.16b\n" + "and v5.16b, v26.16b, v20.16b\n" + "and v28.16b, v23.16b, v18.16b\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v19.4s\n" + "sqadd v26.4s, v26.4s, v5.4s\n" + "sqadd v23.4s, v23.4s, v28.4s\n" + "and v8.16b, v9.16b, v20.16b\n" + "srshl v11.4s, v11.4s, v18.4s\n" + "srshl v26.4s, v26.4s, v20.4s\n" + "srshl v23.4s, v23.4s, v18.4s\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "add v11.4s, v11.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v23.4s, v23.4s, v13.4s\n" + "smin v11.4s, v11.4s, v14.4s\n" + "smin v26.4s, v26.4s, v14.4s\n" + "smin v23.4s, v23.4s, v14.4s\n" + "smax v11.4s, v11.4s, v15.4s\n" + "smax v26.4s, v26.4s, v15.4s\n" + "smax v23.4s, v23.4s, v15.4s\n" + "sqadd v9.4s, v9.4s, v8.4s\n" + "uzp1 v11.16b, v11.16b, v26.16b\n" + "and v1.16b, v12.16b, v18.16b\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "srshl v9.4s, v9.4s, v20.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "sqrdmulh v22.4s, v22.4s, v16.4s\n" + "sqrdmulh v24.4s, v24.4s, v25.4s\n" + "add v9.4s, v9.4s, v13.4s\n" + "sqadd v12.4s, v12.4s, v1.4s\n" + "and v0.16b, v22.16b, v20.16b\n" + "smin v9.4s, v9.4s, v14.4s\n" + "and v26.16b, v24.16b, v18.16b\n" + "srshl v12.4s, v12.4s, v18.4s\n" + "smax v9.4s, v9.4s, v15.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "uzp1 v23.16b, v23.16b, v9.16b\n" + "add v12.4s, v12.4s, v13.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v22.4s, v22.4s, v0.4s\n" + "smin v12.4s, v12.4s, v14.4s\n" + "sqadd v24.4s, v24.4s, v26.4s\n" + "sqrdmulh v10.4s, v10.4s, v16.4s\n" + "smax v12.4s, v12.4s, v15.4s\n" + "srshl v22.4s, v22.4s, v20.4s\n" + "srshl v24.4s, v24.4s, v18.4s\n" + "and v16.16b, v10.16b, v20.16b\n" + "add v22.4s, v22.4s, v13.4s\n" + "add v24.4s, v24.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v22.4s, v22.4s, v14.4s\n" + "smin v24.4s, v24.4s, v14.4s\n" + "sqadd v10.4s, v10.4s, v16.4s\n" + "smax v22.4s, v22.4s, v15.4s\n" + "smax v24.4s, v24.4s, v15.4s\n" + "srshl v10.4s, v10.4s, v20.4s\n" + "uzp1 v12.16b, v12.16b, v22.16b\n" + "uzp1 v12.16b, v12.16b, v12.16b\n" + "add v10.4s, v10.4s, v13.4s\n" + "smin v10.4s, v10.4s, v14.4s\n" + "smax v10.4s, v10.4s, v15.4s\n" + "uzp1 v24.16b, v24.16b, v10.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "tbz x8, #2, 61f\n" + "st1 { v11.s }[0], [x10], #0x4\n" + "st1 { v23.s }[0], [x9], #0x4\n" + "st1 { v12.s }[0], [x28], #0x4\n" + "st1 { v24.s }[0], [x27], #0x4\n" + "tbz x8, #1, 60f\n" + "st1 { v11.h }[2], [x10], #0x2\n" + "st1 { v23.h }[2], [x9], #0x2\n" + "st1 { v12.h }[2], [x28], #0x2\n" + "st1 { v24.h }[2], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v11.b }[6], 
[x10], #0x1\n" + "st1 { v23.b }[6], [x9], #0x1\n" + "st1 { v12.b }[6], [x28], #0x1\n" + "st1 { v24.b }[6], [x27], #0x1\n" + "b 63f\n" + "60:" // Oddments: Bit 2: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v11.b }[4], [x10], #0x1\n" + "st1 { v23.b }[4], [x9], #0x1\n" + "st1 { v12.b }[4], [x28], #0x1\n" + "st1 { v24.b }[4], [x27], #0x1\n" + "b 63f\n" + "61:" // Oddments: Bit 2: Unset + "tbz x8, #1, 62f\n" + "st1 { v11.h }[0], [x10], #0x2\n" + "st1 { v23.h }[0], [x9], #0x2\n" + "st1 { v12.h }[0], [x28], #0x2\n" + "st1 { v24.h }[0], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v11.b }[2], [x10], #0x1\n" + "st1 { v23.b }[2], [x9], #0x1\n" + "st1 { v12.b }[2], [x28], #0x1\n" + "st1 { v24.b }[2], [x27], #0x1\n" + "b 63f\n" + "62:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v11.b }[0], [x10], #0x1\n" + "st1 { v23.b }[0], [x9], #0x1\n" + "st1 { v12.b }[0], [x28], #0x1\n" + "st1 { v24.b }[0], [x27], #0x1\n" + "63:" // Oddments: Bit 2: End + + "64:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..77861e94f0 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef int8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + + kern_type kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + + a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..4e1586b033 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */
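Before the second, ~1,400-line stride-2 listing: the a64_u8s8u8q_nhwc_3x3_s2 descriptor above differs from the stride-1 variant only in its constants — stride_rows/stride_cols go from 1 to 2 and the input tile grows from 4x4 to 5x5. That follows from the usual receptive-field relation between the struct's fields; a small self-contained sketch (illustrative, not library code):

    #include <cassert>

    // Input rows (or columns) consumed by one output tile of a depthfirst
    // depthwise kernel: (output_size - 1) * stride + kernel_size.
    constexpr unsigned int input_extent(unsigned int output_size,
                                        unsigned int stride,
                                        unsigned int kernel_size)
    {
        return (output_size - 1) * stride + kernel_size;
    }

    int main()
    {
        assert(input_extent(2, 1, 3) == 4);  // the s1 kernel above: input_rows == 4
        assert(input_extent(2, 2, 3) == 5);  // this s2 kernel: input_rows == 5
        return 0;
    }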
+ +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[25]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[12]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[1]; + inptrs[3] = inptrs_raw[3]; + inptrs[4] = inptrs_raw[4]; + inptrs[5] = inptrs_raw[5]; + inptrs[6] = inptrs_raw[6]; + inptrs[7] = inptrs_raw[2]; + inptrs[8] = inptrs_raw[8]; + inptrs[9] = inptrs_raw[9]; + inptrs[10] = inptrs_raw[7]; + inptrs[11] = inptrs_raw[15]; + inptrs[12] = inptrs_raw[10]; + inptrs[13] = inptrs_raw[16]; + inptrs[14] = inptrs_raw[11]; + inptrs[15] = inptrs_raw[18]; + inptrs[16] = inptrs_raw[13]; + inptrs[17] = inptrs_raw[19]; + inptrs[18] = inptrs_raw[20]; + inptrs[19] = inptrs_raw[14]; + inptrs[20] = inptrs_raw[21]; + inptrs[21] = inptrs_raw[17]; + inptrs[22] = inptrs_raw[23]; + inptrs[23] = inptrs_raw[22]; + inptrs[24] = inptrs_raw[24]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x4, #0x0\n" + "ldr x5, [%x[params], %[offsetof_Params_weights]]\n" + "mov x6, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x7, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x17, x3, #0x3\n" + "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v22.16b }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v12.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v14.4s }, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v16.4s }, [x20]\n" + "ld1r { v15.4s }, [x19]\n" + "ldp x15, x14, [x21, #0x0]\n" + "ldp x13, x12, [x21, #0x10]\n" + "cbz x17, 3f\n" + "subs x17, x17, 
#0x1\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q13, [x19, #0x0]\n" + "mov v19.16b, v13.16b\n" + "ldr q10, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v11.16b, v13.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v18.16b, v13.16b\n" + "ldr d0, [x5, #0x0]\n" + "ldr d1, [x5, #0x8]\n" + "mov v20.16b, v10.16b\n" + "ldr d2, [x5, #0x10]\n" + "mov v17.16b, v10.16b\n" + "ldr d3, [x5, #0x18]\n" + "mov v21.16b, v10.16b\n" + "ldr d4, [x5, #0x20]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ldr d5, [x5, #0x28]\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr d6, [x5, #0x30]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ldr d7, [x5, #0x38]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr d8, [x5, #0x40]\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ldp x26, x25, [x7, #0x0]\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ldp x24, x23, [x7, #0x10]\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ldp x22, x21, [x7, #0x20]\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "ldp x20, x19, [x7, #0x30]\n" + "ldr d31, [x26, x4]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "ldr d30, [x25, x4]\n" + "ldr d29, [x24, x4]\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr d28, [x23, x4]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr d27, [x22, x4]\n" + "ldr d26, [x21, x4]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr d25, [x20, x4]\n" + "ldr d24, [x19, x4]\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "usubl v24.8h, v24.8b, v22.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v13.4s, v31.4h, v8.4h\n" + "ldr x22, [x7, #0x40]\n" + "add x5, x5, #0x48\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "ldr x21, [x7, #0x48]\n" + "subs x17, x17, #0x1\n" + "smlal v19.4s, v31.4h, v6.4h\n" + "ldr x20, [x7, #0x50]\n" + "smlal2 v20.4s, v31.8h, v6.8h\n" + "ldr x19, [x7, #0x58]\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "ldr x11, [x7, #0x60]\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "ldr x10, [x7, #0x68]\n" + "smlal v18.4s, v31.4h, v0.4h\n" + "ldr x9, [x7, #0x70]\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "ldr x28, [x7, #0x78]\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "ldr x27, [x7, #0x80]\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x26, [x7, #0x88]\n" + "smlal v19.4s, v28.4h, v1.4h\n" + "ldr x25, [x7, #0x90]\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x21, x4]\n" + "smlal v13.4s, v29.4h, v1.4h\n" + "ldr x24, [x7, #0x98]\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "ldr d29, [x22, x4]\n" + "smlal v19.4s, v27.4h, v2.4h\n" + "ldr x23, [x7, #0xa0]\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "ldr d27, [x20, x4]\n" + "smlal v13.4s, v26.4h, v3.4h\n" + "ldr x22, [x7, #0xa8]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "ldr d26, [x19, x4]\n" + "smlal v19.4s, v24.4h, v0.4h\n" + "ldr x21, [x7, #0xb0]\n" + "smlal2 v20.4s, v24.8h, v0.8h\n" + "ldr x20, [x7, #0xb8]\n" + "smlal v13.4s, v25.4h, v4.4h\n" + "ldr x19, [x7, #0xc0]\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "ldr d25, [x11, x4]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr q31, [x8, #0x0]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr q30, [x16, #0x0]\n" + "smlal v13.4s, v24.4h, v2.4h\n" + "ldr q23, [x8, #0x10]\n" + "add x8, x8, #0x20\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "ldr d24, [x9, x4]\n" + "smlal v19.4s, v29.4h, v4.4h\n" + "ldr q9, [x16, #0x10]\n" + "add x16, x16, #0x20\n" + "smlal2 v20.4s, v29.8h, v4.8h\n" + "ldr d29, [x10, x4]\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal v13.4s, v27.4h, v5.4h\n" + "smlal2 v20.4s, v28.8h, v5.8h\n" + "ldr d28, [x27, x4]\n" + "smlal2 v10.4s, v27.8h, 
v5.8h\n" + "smlal v19.4s, v27.4h, v3.4h\n" + "smlal v11.4s, v26.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "ldr d27, [x28, x4]\n" + "smlal2 v17.4s, v26.8h, v3.8h\n" + "ldr d26, [x26, x4]\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "usubl v24.8h, v24.8b, v22.8b\n" + "smlal v13.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v17.4s, v25.8h, v0.8h\n" + "ldr d25, [x25, x4]\n" + "smlal v13.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "smlal v11.4s, v29.4h, v4.4h\n" + "smlal2 v17.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x4]\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v17.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x4]\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x4]\n" + "smlal v19.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "smlal v18.4s, v28.4h, v1.4h\n" + "smlal2 v21.4s, v28.8h, v1.8h\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v26.4h, v5.4h\n" + "smlal2 v21.4s, v26.8h, v5.8h\n" + "ldr d26, [x21, x4]\n" + "smlal v11.4s, v25.4h, v6.4h\n" + "smlal2 v17.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x4]\n" + "smlal v19.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "smlal v18.4s, v29.4h, v2.4h\n" + "smlal2 v21.4s, v29.8h, v2.8h\n" + "ldr d29, [x19, x4]\n" + "add x4, x4, #0x8\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v24.8h, v24.8b, v22.8b\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "smlal2 v17.4s, v27.8h, v7.8h\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v24.4h, v5.4h\n" + "smlal2 v17.4s, v24.8h, v5.8h\n" + "smlal v18.4s, v26.4h, v7.4h\n" + "smlal2 v21.4s, v26.8h, v7.8h\n" + "smlal v11.4s, v25.4h, v8.4h\n" + "smlal2 v17.4s, v25.8h, v8.8h\n" + "smlal v18.4s, v25.4h, v6.4h\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "sqrdmulh v13.4s, v13.4s, v31.4s\n" + "sqrdmulh v10.4s, v10.4s, v23.4s\n" + "smlal v18.4s, v29.4h, v8.4h\n" + "smlal2 v21.4s, v29.8h, v8.8h\n" + "and v27.16b, v13.16b, v30.16b\n" + "and v7.16b, v10.16b, v9.16b\n" + "sqrdmulh v19.4s, v19.4s, v31.4s\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqadd v13.4s, v13.4s, v27.4s\n" + "sqadd v10.4s, v10.4s, v7.4s\n" + "and v6.16b, v19.16b, v30.16b\n" + "and v3.16b, v20.16b, v9.16b\n" + "srshl v13.4s, v13.4s, v30.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "add v13.4s, v13.4s, v14.4s\n" + "add v10.4s, v10.4s, v14.4s\n" + "sqadd v19.4s, v19.4s, v6.4s\n" + "smin v13.4s, v13.4s, v15.4s\n" + "smin v10.4s, v10.4s, v15.4s\n" + "sqadd v20.4s, v20.4s, v3.4s\n" + "smax v13.4s, v13.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "srshl v19.4s, v19.4s, v30.4s\n" + "srshl v20.4s, v20.4s, v9.4s\n" + "uzp1 v13.16b, v13.16b, v10.16b\n" + "sqrdmulh v11.4s, v11.4s, v31.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "str d13, [x15, x6]\n" + "add v19.4s, v19.4s, v14.4s\n" + "add v20.4s, v20.4s, v14.4s\n" + "and v28.16b, v11.16b, v30.16b\n" + "sqrdmulh v17.4s, v17.4s, v23.4s\n" + "smin v19.4s, v19.4s, v15.4s\n" + "smin v20.4s, v20.4s, v15.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "smax v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "sqadd v11.4s, v11.4s, v28.4s\n" + "and v26.16b, 
v17.16b, v9.16b\n" + "uzp1 v19.16b, v19.16b, v20.16b\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d19, [x14, x6]\n" + "srshl v11.4s, v11.4s, v30.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v8.16b, v18.16b, v30.16b\n" + "sqrdmulh v21.4s, v21.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "add v11.4s, v11.4s, v14.4s\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v27.16b, v21.16b, v9.16b\n" + "smin v11.4s, v11.4s, v15.4s\n" + "srshl v17.4s, v17.4s, v9.4s\n" + "sqadd v18.4s, v18.4s, v8.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "add v17.4s, v17.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v27.4s\n" + "smin v17.4s, v17.4s, v15.4s\n" + "add v18.4s, v18.4s, v14.4s\n" + "smax v17.4s, v17.4s, v16.4s\n" + "srshl v21.4s, v21.4s, v9.4s\n" + "smin v18.4s, v18.4s, v15.4s\n" + "uzp1 v11.16b, v11.16b, v17.16b\n" + "add v21.4s, v21.4s, v14.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x13, x6]\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smin v21.4s, v21.4s, v15.4s\n" + "smax v21.4s, v21.4s, v16.4s\n" + "uzp1 v18.16b, v18.16b, v21.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d18, [x12, x6]\n" + "add x6, x6, #0x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q13, [x19, #0x0]\n" + "mov v19.16b, v13.16b\n" + "ldr q10, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "mov v11.16b, v13.16b\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "mov v18.16b, v13.16b\n" + "ldr d0, [x5, #0x0]\n" + "ldr d1, [x5, #0x8]\n" + "mov v20.16b, v10.16b\n" + "ldr d2, [x5, #0x10]\n" + "mov v17.16b, v10.16b\n" + "ldr d3, [x5, #0x18]\n" + "mov v21.16b, v10.16b\n" + "ldr d4, [x5, #0x20]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ldr d5, [x5, #0x28]\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr d6, [x5, #0x30]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ldr d7, [x5, #0x38]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr d8, [x5, #0x40]\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ldp x26, x25, [x7, #0x0]\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ldp x24, x23, [x7, #0x10]\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ldp x22, x21, [x7, #0x20]\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "ldp x20, x19, [x7, #0x30]\n" + "ldr d31, [x26, x4]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "ldr d30, [x25, x4]\n" + "ldr d29, [x24, x4]\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr d28, [x23, x4]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr d27, [x22, x4]\n" + "ldr d26, [x21, x4]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr d25, [x20, x4]\n" + "ldr d24, [x19, x4]\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "usubl v24.8h, v24.8b, v22.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v13.4s, v31.4h, v8.4h\n" + "ldr x22, [x7, #0x40]\n" + "tst x3, #0x7\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "ldr x21, [x7, #0x48]\n" + "smlal v19.4s, v31.4h, v6.4h\n" + "ldr x20, [x7, #0x50]\n" + "smlal2 v20.4s, v31.8h, v6.8h\n" + "ldr x19, [x7, #0x58]\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "ldr x11, [x7, #0x60]\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "ldr x10, [x7, #0x68]\n" + "smlal v18.4s, v31.4h, v0.4h\n" + "ldr x9, [x7, #0x70]\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "ldr x28, [x7, #0x78]\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "ldr x27, [x7, #0x80]\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x26, [x7, #0x88]\n" + "smlal v19.4s, v28.4h, v1.4h\n" + "ldr x25, [x7, #0x90]\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + "ldr d28, [x21, x4]\n" + "smlal v13.4s, v29.4h, v1.4h\n" + "ldr x24, [x7, 
#0x98]\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "ldr d29, [x22, x4]\n" + "smlal v19.4s, v27.4h, v2.4h\n" + "ldr x23, [x7, #0xa0]\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "ldr d27, [x20, x4]\n" + "smlal v13.4s, v26.4h, v3.4h\n" + "ldr x22, [x7, #0xa8]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "ldr d26, [x19, x4]\n" + "smlal v19.4s, v24.4h, v0.4h\n" + "ldr x21, [x7, #0xb0]\n" + "smlal2 v20.4s, v24.8h, v0.8h\n" + "ldr x20, [x7, #0xb8]\n" + "smlal v13.4s, v25.4h, v4.4h\n" + "ldr x19, [x7, #0xc0]\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "ldr d25, [x11, x4]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr q31, [x8, #0x0]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr q30, [x16, #0x0]\n" + "smlal v13.4s, v24.4h, v2.4h\n" + "ldr q23, [x8, #0x10]\n" + "add x8, x8, #0x20\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "ldr d24, [x9, x4]\n" + "smlal v19.4s, v29.4h, v4.4h\n" + "ldr q9, [x16, #0x10]\n" + "add x16, x16, #0x20\n" + "smlal2 v20.4s, v29.8h, v4.8h\n" + "ldr d29, [x10, x4]\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal v13.4s, v27.4h, v5.4h\n" + "smlal2 v20.4s, v28.8h, v5.8h\n" + "ldr d28, [x27, x4]\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "smlal v19.4s, v27.4h, v3.4h\n" + "smlal v11.4s, v26.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "ldr d27, [x28, x4]\n" + "smlal2 v17.4s, v26.8h, v3.8h\n" + "ldr d26, [x26, x4]\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "usubl v24.8h, v24.8b, v22.8b\n" + "smlal v13.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v17.4s, v25.8h, v0.8h\n" + "ldr d25, [x25, x4]\n" + "smlal v13.4s, v24.4h, v7.4h\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "smlal v11.4s, v29.4h, v4.4h\n" + "smlal2 v17.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x4]\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v17.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x4]\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x4]\n" + "smlal v19.4s, v28.4h, v7.4h\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "smlal v18.4s, v28.4h, v1.4h\n" + "smlal2 v21.4s, v28.8h, v1.8h\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v26.4h, v5.4h\n" + "smlal2 v21.4s, v26.8h, v5.8h\n" + "ldr d26, [x21, x4]\n" + "smlal v11.4s, v25.4h, v6.4h\n" + "smlal2 v17.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x4]\n" + "smlal v19.4s, v29.4h, v8.4h\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "smlal v18.4s, v29.4h, v2.4h\n" + "smlal2 v21.4s, v29.8h, v2.8h\n" + "ldr d29, [x19, x4]\n" + "add x4, x4, #0x8\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v24.8h, v24.8b, v22.8b\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "smlal2 v17.4s, v27.8h, v7.8h\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v24.4h, v5.4h\n" + "smlal2 v17.4s, v24.8h, v5.8h\n" + "smlal v18.4s, v26.4h, v7.4h\n" + "smlal2 v21.4s, v26.8h, v7.8h\n" + "smlal v11.4s, v25.4h, v8.4h\n" + "smlal2 v17.4s, v25.8h, v8.8h\n" + "smlal v18.4s, v25.4h, v6.4h\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "sqrdmulh v13.4s, v13.4s, v31.4s\n" + "sqrdmulh v10.4s, v10.4s, v23.4s\n" + "smlal v18.4s, v29.4h, v8.4h\n" + "smlal2 v21.4s, v29.8h, v8.8h\n" + "and v27.16b, v13.16b, v30.16b\n" + "and v7.16b, v10.16b, v9.16b\n" + "sqrdmulh v19.4s, v19.4s, v31.4s\n" + "sshr v27.4s, 
v27.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "sqadd v13.4s, v13.4s, v27.4s\n" + "sqadd v10.4s, v10.4s, v7.4s\n" + "and v6.16b, v19.16b, v30.16b\n" + "and v3.16b, v20.16b, v9.16b\n" + "srshl v13.4s, v13.4s, v30.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "add v13.4s, v13.4s, v14.4s\n" + "add v10.4s, v10.4s, v14.4s\n" + "sqadd v19.4s, v19.4s, v6.4s\n" + "smin v13.4s, v13.4s, v15.4s\n" + "smin v10.4s, v10.4s, v15.4s\n" + "sqadd v20.4s, v20.4s, v3.4s\n" + "smax v13.4s, v13.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "srshl v19.4s, v19.4s, v30.4s\n" + "srshl v20.4s, v20.4s, v9.4s\n" + "uzp1 v13.16b, v13.16b, v10.16b\n" + "sqrdmulh v11.4s, v11.4s, v31.4s\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "str d13, [x15, x6]\n" + "add v19.4s, v19.4s, v14.4s\n" + "add v20.4s, v20.4s, v14.4s\n" + "and v28.16b, v11.16b, v30.16b\n" + "sqrdmulh v17.4s, v17.4s, v23.4s\n" + "smin v19.4s, v19.4s, v15.4s\n" + "smin v20.4s, v20.4s, v15.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "smax v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "sqadd v11.4s, v11.4s, v28.4s\n" + "and v26.16b, v17.16b, v9.16b\n" + "uzp1 v19.16b, v19.16b, v20.16b\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d19, [x14, x6]\n" + "srshl v11.4s, v11.4s, v30.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v8.16b, v18.16b, v30.16b\n" + "sqrdmulh v21.4s, v21.4s, v23.4s\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "add v11.4s, v11.4s, v14.4s\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v27.16b, v21.16b, v9.16b\n" + "smin v11.4s, v11.4s, v15.4s\n" + "srshl v17.4s, v17.4s, v9.4s\n" + "sqadd v18.4s, v18.4s, v8.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "add v17.4s, v17.4s, v14.4s\n" + "srshl v18.4s, v18.4s, v30.4s\n" + "sqadd v21.4s, v21.4s, v27.4s\n" + "smin v17.4s, v17.4s, v15.4s\n" + "add v18.4s, v18.4s, v14.4s\n" + "smax v17.4s, v17.4s, v16.4s\n" + "srshl v21.4s, v21.4s, v9.4s\n" + "smin v18.4s, v18.4s, v15.4s\n" + "uzp1 v11.16b, v11.16b, v17.16b\n" + "add v21.4s, v21.4s, v14.4s\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d11, [x13, x6]\n" + "smax v18.4s, v18.4s, v16.4s\n" + "smin v21.4s, v21.4s, v15.4s\n" + "smax v21.4s, v21.4s, v16.4s\n" + "uzp1 v18.16b, v18.16b, v21.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d18, [x12, x6]\n" + "add x6, x6, #0x8\n" + "beq 88f\n" + "add x5, x5, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x3, #2, 5f\n" + "ld1 { v13.4s }, [x19], #0x10\n" + "tbz x3, #1, 4f\n" + "ld1 { v10.d }[0], [x19], #0x8\n" + "tbz x3, #0, 7f\n" + "ld1 { v10.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x3, #0, 7f\n" + "ld1 { v10.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x3, #1, 6f\n" + "ld1 { v13.d }[0], [x19], #0x8\n" + "tbz x3, #0, 7f\n" + "ld1 { v13.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 7f\n" + "ld1 { v13.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v19.16b, v13.16b\n" + "ldr d0, [x5, #0x0]\n" + "mov v20.16b, v10.16b\n" + "ldr d1, [x5, #0x8]\n" + "mov v11.16b, v13.16b\n" + "ldr d2, [x5, #0x10]\n" + "mov v17.16b, v10.16b\n" + "ldr d3, [x5, #0x18]\n" + "mov v18.16b, v13.16b\n" + "ldr d4, [x5, #0x20]\n" + "mov v21.16b, v10.16b\n" + "ldr d5, [x5, #0x28]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ldr d6, [x5, #0x30]\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr d7, [x5, 
#0x38]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ldr d8, [x5, #0x40]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldp x26, x25, [x7, #0x0]\n" + "add x26, x26, x4\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ldp x24, x23, [x7, #0x10]\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ldp x22, x21, [x7, #0x20]\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "add x25, x25, x4\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ldp x20, x19, [x7, #0x30]\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "add x24, x24, x4\n" + "add x23, x23, x4\n" + "add x22, x22, x4\n" + "add x21, x21, x4\n" + "add x20, x20, x4\n" + "add x19, x19, x4\n" + "tbz x3, #2, 9f\n" + "ld1 { v31.s }[0], [x26], #0x4\n" + "ld1 { v30.s }[0], [x25], #0x4\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x3, #1, 8f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v30.h }[2], [x25], #0x2\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "ld1 { v24.h }[2], [x19], #0x2\n" + "tbz x3, #0, 11f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v30.b }[6], [x25]\n" + "ld1 { v29.b }[6], [x24]\n" + "ld1 { v28.b }[6], [x23]\n" + "ld1 { v27.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "ld1 { v24.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x3, #0, 11f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v30.b }[4], [x25]\n" + "ld1 { v29.b }[4], [x24]\n" + "ld1 { v28.b }[4], [x23]\n" + "ld1 { v27.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "ld1 { v24.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x3, #1, 10f\n" + "ld1 { v31.h }[0], [x26], #0x2\n" + "ld1 { v30.h }[0], [x25], #0x2\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x3, #0, 11f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v30.b }[2], [x25]\n" + "ld1 { v29.b }[2], [x24]\n" + "ld1 { v28.b }[2], [x23]\n" + "ld1 { v27.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "ld1 { v24.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 11f\n" + "ld1 { v31.b }[0], [x26]\n" + "ld1 { v30.b }[0], [x25]\n" + "ld1 { v29.b }[0], [x24]\n" + "ld1 { v28.b }[0], [x23]\n" + "ld1 { v27.b }[0], [x22]\n" + "ld1 { v26.b }[0], [x21]\n" + "ld1 { v25.b }[0], [x20]\n" + "ld1 { v24.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "usubl v31.8h, v31.8b, v22.8b\n" + "ldr x22, [x7, #0x40]\n" + "add x22, x22, x4\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "usubl v26.8h, v26.8b, v22.8b\n" + "usubl v25.8h, v25.8b, v22.8b\n" + "usubl v24.8h, v24.8b, v22.8b\n" + "smlal v13.4s, v31.4h, v8.4h\n" + "smlal2 v10.4s, v31.8h, v8.8h\n" + "smlal v19.4s, v31.4h, v6.4h\n" + "smlal2 v20.4s, v31.8h, v6.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "smlal v18.4s, v31.4h, v0.4h\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "smlal v13.4s, v30.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "smlal v19.4s, v28.4h, v1.4h\n" + "smlal2 v20.4s, v28.8h, v1.8h\n" + 
"smlal v13.4s, v29.4h, v1.4h\n" + "smlal2 v10.4s, v29.8h, v1.8h\n" + "smlal v19.4s, v27.4h, v2.4h\n" + "smlal2 v20.4s, v27.8h, v2.8h\n" + "smlal v13.4s, v26.4h, v3.4h\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "smlal v19.4s, v24.4h, v0.4h\n" + "smlal2 v20.4s, v24.8h, v0.8h\n" + "smlal v13.4s, v25.4h, v4.4h\n" + "smlal2 v10.4s, v25.8h, v4.8h\n" + "smlal v13.4s, v24.4h, v2.4h\n" + "smlal2 v10.4s, v24.8h, v2.8h\n" + "tbz x3, #2, 13f\n" + "ld1 { v29.s }[0], [x22], #0x4\n" + "tbz x3, #1, 12f\n" + "ld1 { v29.h }[2], [x22], #0x2\n" + "tbz x3, #0, 15f\n" + "ld1 { v29.b }[6], [x22]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x3, #0, 15f\n" + "ld1 { v29.b }[4], [x22]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x3, #1, 14f\n" + "ld1 { v29.h }[0], [x22], #0x2\n" + "tbz x3, #0, 15f\n" + "ld1 { v29.b }[2], [x22]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 15f\n" + "ld1 { v29.b }[0], [x22]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x21, [x7, #0x48]\n" + "smlal v19.4s, v29.4h, v4.4h\n" + "add x21, x21, x4\n" + "smlal2 v20.4s, v29.8h, v4.8h\n" + "tbz x3, #2, 17f\n" + "ld1 { v28.s }[0], [x21], #0x4\n" + "tbz x3, #1, 16f\n" + "ld1 { v28.h }[2], [x21], #0x2\n" + "tbz x3, #0, 19f\n" + "ld1 { v28.b }[6], [x21]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x3, #0, 19f\n" + "ld1 { v28.b }[4], [x21]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x3, #1, 18f\n" + "ld1 { v28.h }[0], [x21], #0x2\n" + "tbz x3, #0, 19f\n" + "ld1 { v28.b }[2], [x21]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 19f\n" + "ld1 { v28.b }[0], [x21]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr x20, [x7, #0x50]\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "add x20, x20, x4\n" + "smlal2 v20.4s, v28.8h, v5.8h\n" + "tbz x3, #2, 21f\n" + "ld1 { v27.s }[0], [x20], #0x4\n" + "tbz x3, #1, 20f\n" + "ld1 { v27.h }[2], [x20], #0x2\n" + "tbz x3, #0, 23f\n" + "ld1 { v27.b }[6], [x20]\n" + "b 23f\n" + "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset + "tbz x3, #0, 23f\n" + "ld1 { v27.b }[4], [x20]\n" + "b 23f\n" + "21:" // Oddments: Load (1, 2): Bit 2: Unset + "tbz x3, #1, 22f\n" + "ld1 { v27.h }[0], [x20], #0x2\n" + "tbz x3, #0, 23f\n" + "ld1 { v27.b }[2], [x20]\n" + "b 23f\n" + "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 23f\n" + "ld1 { v27.b }[0], [x20]\n" + "23:" // Oddments: Load (1, 2): Bit 2: End + "usubl v27.8h, v27.8b, v22.8b\n" + "ldr x19, [x7, #0x58]\n" + "smlal v13.4s, v27.4h, v5.4h\n" + "add x19, x19, x4\n" + "smlal2 v10.4s, v27.8h, v5.8h\n" + "smlal v19.4s, v27.4h, v3.4h\n" + "smlal2 v20.4s, v27.8h, v3.8h\n" + "tbz x3, #2, 25f\n" + "ld1 { v26.s }[0], [x19], #0x4\n" + "tbz x3, #1, 24f\n" + "ld1 { v26.h }[2], [x19], #0x2\n" + "tbz x3, #0, 27f\n" + "ld1 { v26.b }[6], [x19]\n" + "b 27f\n" + "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x3, #0, 27f\n" + "ld1 { v26.b }[4], [x19]\n" + "b 27f\n" + "25:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x3, #1, 26f\n" + "ld1 { v26.h }[0], [x19], #0x2\n" + "tbz x3, #0, 27f\n" + "ld1 { v26.b }[2], [x19]\n" + "b 27f\n" + "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 27f\n" + "ld1 { v26.b }[0], [x19]\n" + "27:" // Oddments: Load (3, 0): Bit 2: End + "usubl v26.8h, v26.8b, v22.8b\n" + "ldr x11, [x7, #0x60]\n" + "smlal v11.4s, v26.4h, v3.4h\n" + "add x11, 
x11, x4\n" + "smlal2 v17.4s, v26.8h, v3.8h\n" + "tbz x3, #2, 29f\n" + "ld1 { v25.s }[0], [x11], #0x4\n" + "tbz x3, #1, 28f\n" + "ld1 { v25.h }[2], [x11], #0x2\n" + "tbz x3, #0, 31f\n" + "ld1 { v25.b }[6], [x11]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x3, #0, 31f\n" + "ld1 { v25.b }[4], [x11]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x3, #1, 30f\n" + "ld1 { v25.h }[0], [x11], #0x2\n" + "tbz x3, #0, 31f\n" + "ld1 { v25.b }[2], [x11]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 31f\n" + "ld1 { v25.b }[0], [x11]\n" + "31:" // Oddments: Load (2, 0): Bit 2: End + "usubl v25.8h, v25.8b, v22.8b\n" + "ldr x10, [x7, #0x68]\n" + "smlal v13.4s, v25.4h, v6.4h\n" + "add x10, x10, x4\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v17.4s, v25.8h, v0.8h\n" + "tbz x3, #2, 33f\n" + "ld1 { v29.s }[0], [x10], #0x4\n" + "tbz x3, #1, 32f\n" + "ld1 { v29.h }[2], [x10], #0x2\n" + "tbz x3, #0, 35f\n" + "ld1 { v29.b }[6], [x10]\n" + "b 35f\n" + "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x3, #0, 35f\n" + "ld1 { v29.b }[4], [x10]\n" + "b 35f\n" + "33:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x3, #1, 34f\n" + "ld1 { v29.h }[0], [x10], #0x2\n" + "tbz x3, #0, 35f\n" + "ld1 { v29.b }[2], [x10]\n" + "b 35f\n" + "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 35f\n" + "ld1 { v29.b }[0], [x10]\n" + "35:" // Oddments: Load (3, 1): Bit 2: End + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x9, [x7, #0x70]\n" + "smlal v11.4s, v29.4h, v4.4h\n" + "add x9, x9, x4\n" + "smlal2 v17.4s, v29.8h, v4.8h\n" + "tbz x3, #2, 37f\n" + "ld1 { v24.s }[0], [x9], #0x4\n" + "tbz x3, #1, 36f\n" + "ld1 { v24.h }[2], [x9], #0x2\n" + "tbz x3, #0, 39f\n" + "ld1 { v24.b }[6], [x9]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset + "tbz x3, #0, 39f\n" + "ld1 { v24.b }[4], [x9]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x3, #1, 38f\n" + "ld1 { v24.h }[0], [x9], #0x2\n" + "tbz x3, #0, 39f\n" + "ld1 { v24.b }[2], [x9]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 39f\n" + "ld1 { v24.b }[0], [x9]\n" + "39:" // Oddments: Load (2, 1): Bit 2: End + "usubl v24.8h, v24.8b, v22.8b\n" + "ldr x28, [x7, #0x78]\n" + "smlal v13.4s, v24.4h, v7.4h\n" + "add x28, x28, x4\n" + "smlal2 v10.4s, v24.8h, v7.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v17.4s, v24.8h, v1.8h\n" + "tbz x3, #2, 41f\n" + "ld1 { v27.s }[0], [x28], #0x4\n" + "tbz x3, #1, 40f\n" + "ld1 { v27.h }[2], [x28], #0x2\n" + "tbz x3, #0, 43f\n" + "ld1 { v27.b }[6], [x28]\n" + "b 43f\n" + "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x3, #0, 43f\n" + "ld1 { v27.b }[4], [x28]\n" + "b 43f\n" + "41:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x3, #1, 42f\n" + "ld1 { v27.h }[0], [x28], #0x2\n" + "tbz x3, #0, 43f\n" + "ld1 { v27.b }[2], [x28]\n" + "b 43f\n" + "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 43f\n" + "ld1 { v27.b }[0], [x28]\n" + "43:" // Oddments: Load (3, 3): Bit 2: End + "usubl v27.8h, v27.8b, v22.8b\n" + "ldr x27, [x7, #0x80]\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "add x27, x27, x4\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "tbz x3, #2, 45f\n" + "ld1 { v28.s }[0], [x27], #0x4\n" + "tbz x3, #1, 44f\n" + "ld1 { v28.h }[2], [x27], #0x2\n" + "tbz x3, #0, 47f\n" + "ld1 { v28.b }[6], [x27]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x3, #0, 47f\n" + "ld1 { 
v28.b }[4], [x27]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x3, #1, 46f\n" + "ld1 { v28.h }[0], [x27], #0x2\n" + "tbz x3, #0, 47f\n" + "ld1 { v28.b }[2], [x27]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 47f\n" + "ld1 { v28.b }[0], [x27]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr x26, [x7, #0x88]\n" + "smlal v19.4s, v28.4h, v7.4h\n" + "add x26, x26, x4\n" + "smlal2 v20.4s, v28.8h, v7.8h\n" + "smlal v18.4s, v28.4h, v1.4h\n" + "smlal2 v21.4s, v28.8h, v1.8h\n" + "tbz x3, #2, 49f\n" + "ld1 { v26.s }[0], [x26], #0x4\n" + "tbz x3, #1, 48f\n" + "ld1 { v26.h }[2], [x26], #0x2\n" + "tbz x3, #0, 51f\n" + "ld1 { v26.b }[6], [x26]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x3, #0, 51f\n" + "ld1 { v26.b }[4], [x26]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x3, #1, 50f\n" + "ld1 { v26.h }[0], [x26], #0x2\n" + "tbz x3, #0, 51f\n" + "ld1 { v26.b }[2], [x26]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 51f\n" + "ld1 { v26.b }[0], [x26]\n" + "51:" // Oddments: Load (3, 4): Bit 2: End + "usubl v26.8h, v26.8b, v22.8b\n" + "ldr x25, [x7, #0x90]\n" + "smlal v18.4s, v26.4h, v5.4h\n" + "add x25, x25, x4\n" + "smlal2 v21.4s, v26.8h, v5.8h\n" + "tbz x3, #2, 53f\n" + "ld1 { v25.s }[0], [x25], #0x4\n" + "tbz x3, #1, 52f\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "tbz x3, #0, 55f\n" + "ld1 { v25.b }[6], [x25]\n" + "b 55f\n" + "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x3, #0, 55f\n" + "ld1 { v25.b }[4], [x25]\n" + "b 55f\n" + "53:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x3, #1, 54f\n" + "ld1 { v25.h }[0], [x25], #0x2\n" + "tbz x3, #0, 55f\n" + "ld1 { v25.b }[2], [x25]\n" + "b 55f\n" + "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 55f\n" + "ld1 { v25.b }[0], [x25]\n" + "55:" // Oddments: Load (4, 0): Bit 2: End + "usubl v25.8h, v25.8b, v22.8b\n" + "ldr x24, [x7, #0x98]\n" + "smlal v11.4s, v25.4h, v6.4h\n" + "add x24, x24, x4\n" + "smlal2 v17.4s, v25.8h, v6.8h\n" + "tbz x3, #2, 57f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x3, #1, 56f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x3, #0, 59f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 59f\n" + "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x3, #0, 59f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 59f\n" + "57:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x3, #1, 58f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x3, #0, 59f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 59f\n" + "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 59f\n" + "ld1 { v29.b }[0], [x24]\n" + "59:" // Oddments: Load (2, 4): Bit 2: End + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x23, [x7, #0xa0]\n" + "smlal v19.4s, v29.4h, v8.4h\n" + "add x23, x23, x4\n" + "smlal2 v20.4s, v29.8h, v8.8h\n" + "smlal v18.4s, v29.4h, v2.4h\n" + "smlal2 v21.4s, v29.8h, v2.8h\n" + "tbz x3, #2, 61f\n" + "ld1 { v27.s }[0], [x23], #0x4\n" + "tbz x3, #1, 60f\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "tbz x3, #0, 63f\n" + "ld1 { v27.b }[6], [x23]\n" + "b 63f\n" + "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x3, #0, 63f\n" + "ld1 { v27.b }[4], [x23]\n" + "b 63f\n" + "61:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x3, #1, 62f\n" + "ld1 { v27.h }[0], [x23], #0x2\n" + "tbz x3, #0, 63f\n" + "ld1 { v27.b }[2], [x23]\n" + "b 63f\n" + "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 63f\n" + "ld1 { v27.b 
}[0], [x23]\n" + "63:" // Oddments: Load (4, 1): Bit 2: End + "usubl v27.8h, v27.8b, v22.8b\n" + "ldr x22, [x7, #0xa8]\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "add x22, x22, x4\n" + "smlal2 v17.4s, v27.8h, v7.8h\n" + "tbz x3, #2, 65f\n" + "ld1 { v24.s }[0], [x22], #0x4\n" + "tbz x3, #1, 64f\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "tbz x3, #0, 67f\n" + "ld1 { v24.b }[6], [x22]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x3, #0, 67f\n" + "ld1 { v24.b }[4], [x22]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x3, #1, 66f\n" + "ld1 { v24.h }[0], [x22], #0x2\n" + "tbz x3, #0, 67f\n" + "ld1 { v24.b }[2], [x22]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 67f\n" + "ld1 { v24.b }[0], [x22]\n" + "67:" // Oddments: Load (3, 2): Bit 2: End + "usubl v24.8h, v24.8b, v22.8b\n" + "ldr x21, [x7, #0xb0]\n" + "smlal v11.4s, v24.4h, v5.4h\n" + "add x21, x21, x4\n" + "smlal2 v17.4s, v24.8h, v5.8h\n" + "smlal v18.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "tbz x3, #2, 69f\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "tbz x3, #1, 68f\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "tbz x3, #0, 71f\n" + "ld1 { v26.b }[6], [x21]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x3, #0, 71f\n" + "ld1 { v26.b }[4], [x21]\n" + "b 71f\n" + "69:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x3, #1, 70f\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "tbz x3, #0, 71f\n" + "ld1 { v26.b }[2], [x21]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 71f\n" + "ld1 { v26.b }[0], [x21]\n" + "71:" // Oddments: Load (4, 3): Bit 2: End + "usubl v26.8h, v26.8b, v22.8b\n" + "ldr x20, [x7, #0xb8]\n" + "smlal v18.4s, v26.4h, v7.4h\n" + "add x20, x20, x4\n" + "smlal2 v21.4s, v26.8h, v7.8h\n" + "tbz x3, #2, 73f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x3, #1, 72f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x3, #0, 75f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x3, #0, 75f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x3, #1, 74f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x3, #0, 75f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 75f\n" + "ld1 { v25.b }[0], [x20]\n" + "75:" // Oddments: Load (4, 2): Bit 2: End + "usubl v25.8h, v25.8b, v22.8b\n" + "ldr x19, [x7, #0xc0]\n" + "smlal v11.4s, v25.4h, v8.4h\n" + "add x19, x19, x4\n" + "smlal2 v17.4s, v25.8h, v8.8h\n" + "smlal v18.4s, v25.4h, v6.4h\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "tbz x3, #2, 77f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x3, #1, 76f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x3, #0, 79f\n" + "ld1 { v29.b }[6], [x19]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x3, #0, 79f\n" + "ld1 { v29.b }[4], [x19]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x3, #1, 78f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x3, #0, 79f\n" + "ld1 { v29.b }[2], [x19]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 79f\n" + "ld1 { v29.b }[0], [x19]\n" + "79:" // Oddments: Load (4, 4): Bit 2: End + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v29.4h, v8.4h\n" + "smlal2 v21.4s, v29.8h, v8.8h\n" + "tbz x3, #2, 81f\n" + "ld1 { v31.4s }, [x8], #0x10\n" + "ld1 { v30.4s }, [x16], #0x10\n" + "tbz x3, #1, 80f\n" + "ld1 { v23.d }[0], 
[x8], #0x8\n" + "ld1 { v9.d }[0], [x16], #0x8\n" + "tbz x3, #0, 83f\n" + "ld1 { v23.s }[2], [x8]\n" + "ld1 { v9.s }[2], [x16]\n" + "b 83f\n" + "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x3, #0, 83f\n" + "ld1 { v23.s }[0], [x8]\n" + "ld1 { v9.s }[0], [x16]\n" + "b 83f\n" + "81:" // Oddments: Load requant params: Bit 2: Unset + "tbz x3, #1, 82f\n" + "ld1 { v31.d }[0], [x8], #0x8\n" + "ld1 { v30.d }[0], [x16], #0x8\n" + "tbz x3, #0, 83f\n" + "ld1 { v31.s }[2], [x8]\n" + "ld1 { v30.s }[2], [x16]\n" + "b 83f\n" + "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 83f\n" + "ld1 { v31.s }[0], [x8]\n" + "ld1 { v30.s }[0], [x16]\n" + "83:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v13.4s, v13.4s, v31.4s\n" + "add x15, x15, x6\n" + "sqrdmulh v10.4s, v10.4s, v23.4s\n" + "add x14, x14, x6\n" + "sqrdmulh v19.4s, v19.4s, v31.4s\n" + "add x13, x13, x6\n" + "sqrdmulh v20.4s, v20.4s, v23.4s\n" + "add x12, x12, x6\n" + "sqrdmulh v11.4s, v11.4s, v31.4s\n" + "and v27.16b, v13.16b, v30.16b\n" + "and v7.16b, v10.16b, v9.16b\n" + "and v6.16b, v19.16b, v30.16b\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v27.4s\n" + "sqadd v10.4s, v10.4s, v7.4s\n" + "sqadd v19.4s, v19.4s, v6.4s\n" + "and v3.16b, v20.16b, v9.16b\n" + "srshl v13.4s, v13.4s, v30.4s\n" + "srshl v10.4s, v10.4s, v9.4s\n" + "srshl v19.4s, v19.4s, v30.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "add v13.4s, v13.4s, v14.4s\n" + "add v10.4s, v10.4s, v14.4s\n" + "add v19.4s, v19.4s, v14.4s\n" + "smin v13.4s, v13.4s, v15.4s\n" + "smin v10.4s, v10.4s, v15.4s\n" + "smin v19.4s, v19.4s, v15.4s\n" + "smax v13.4s, v13.4s, v16.4s\n" + "smax v10.4s, v10.4s, v16.4s\n" + "smax v19.4s, v19.4s, v16.4s\n" + "sqadd v20.4s, v20.4s, v3.4s\n" + "uzp1 v13.16b, v13.16b, v10.16b\n" + "and v28.16b, v11.16b, v30.16b\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "srshl v20.4s, v20.4s, v9.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sqrdmulh v17.4s, v17.4s, v23.4s\n" + "sqrdmulh v18.4s, v18.4s, v31.4s\n" + "add v20.4s, v20.4s, v14.4s\n" + "sqadd v11.4s, v11.4s, v28.4s\n" + "and v26.16b, v17.16b, v9.16b\n" + "smin v20.4s, v20.4s, v15.4s\n" + "and v8.16b, v18.16b, v30.16b\n" + "srshl v11.4s, v11.4s, v30.4s\n" + "smax v20.4s, v20.4s, v16.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "uzp1 v19.16b, v19.16b, v20.16b\n" + "add v11.4s, v11.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "sqadd v17.4s, v17.4s, v26.4s\n" + "smin v11.4s, v11.4s, v15.4s\n" + "sqadd v18.4s, v18.4s, v8.4s\n" + "sqrdmulh v21.4s, v21.4s, v23.4s\n" + "smax v11.4s, v11.4s, v16.4s\n" + "srshl v17.4s, v17.4s, v9.4s\n" + "srshl v18.4s, v18.4s, v30.4s\n" + "and v27.16b, v21.16b, v9.16b\n" + "add v17.4s, v17.4s, v14.4s\n" + "add v18.4s, v18.4s, v14.4s\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "smin v17.4s, v17.4s, v15.4s\n" + "smin v18.4s, v18.4s, v15.4s\n" + "sqadd v21.4s, v21.4s, v27.4s\n" + "smax v17.4s, v17.4s, v16.4s\n" + "smax v18.4s, v18.4s, v16.4s\n" + "srshl v21.4s, v21.4s, v9.4s\n" + "uzp1 v11.16b, v11.16b, v17.16b\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "add v21.4s, v21.4s, v14.4s\n" + "smin v21.4s, v21.4s, v15.4s\n" + "smax v21.4s, v21.4s, v16.4s\n" + "uzp1 v18.16b, v18.16b, v21.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "tbz x3, #2, 85f\n" + "st1 { v13.s }[0], [x15], #0x4\n" + "st1 { v19.s }[0], [x14], #0x4\n" + "st1 { v11.s }[0], [x13], #0x4\n" + "st1 { v18.s }[0], [x12], #0x4\n" + "tbz x3, #1, 84f\n" + "st1 { v13.h }[2], [x15], #0x2\n" + 
"st1 { v19.h }[2], [x14], #0x2\n" + "st1 { v11.h }[2], [x13], #0x2\n" + "st1 { v18.h }[2], [x12], #0x2\n" + "tbz x3, #0, 87f\n" + "st1 { v13.b }[6], [x15], #0x1\n" + "st1 { v19.b }[6], [x14], #0x1\n" + "st1 { v11.b }[6], [x13], #0x1\n" + "st1 { v18.b }[6], [x12], #0x1\n" + "b 87f\n" + "84:" // Oddments: Bit 2: Bit 1: Unset + "tbz x3, #0, 87f\n" + "st1 { v13.b }[4], [x15], #0x1\n" + "st1 { v19.b }[4], [x14], #0x1\n" + "st1 { v11.b }[4], [x13], #0x1\n" + "st1 { v18.b }[4], [x12], #0x1\n" + "b 87f\n" + "85:" // Oddments: Bit 2: Unset + "tbz x3, #1, 86f\n" + "st1 { v13.h }[0], [x15], #0x2\n" + "st1 { v19.h }[0], [x14], #0x2\n" + "st1 { v11.h }[0], [x13], #0x2\n" + "st1 { v18.h }[0], [x12], #0x2\n" + "tbz x3, #0, 87f\n" + "st1 { v13.b }[2], [x15], #0x1\n" + "st1 { v19.b }[2], [x14], #0x1\n" + "st1 { v11.b }[2], [x13], #0x1\n" + "st1 { v18.b }[2], [x12], #0x1\n" + "b 87f\n" + "86:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x3, #0, 87f\n" + "st1 { v13.b }[0], [x15], #0x1\n" + "st1 { v19.b }[0], [x14], #0x1\n" + "st1 { v11.b }[0], [x13], #0x1\n" + "st1 { v18.b }[0], [x12], #0x1\n" + "87:" // Oddments: Bit 2: End + + "88:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..d3d5000d4c --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef int8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..97156137bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2213 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const int8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const int8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const int8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__
__volatile__( + "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" + "mov x10, #0x0\n" + "ldr x3, [%x[params], %[offsetof_Params_weights]]\n" + "mov x1, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "add x25, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n" + "lsr x19, x4, #0x3\n" + "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x13, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1r { v9.16b }, [x13]\n" + "add x8, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1r { v14.16b }, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1r { v10.4s }, [x8]\n" + "add x8, x22, %[offsetof_Requantize32_maxval]\n" + "ld1r { v11.4s }, [x20]\n" + "ld1r { v13.4s }, [x8]\n" + "ldp x17, x16, [x21, #0x0]\n" + "ldp x6, x8, [x21, #0x10]\n" + "cbz x19, 3f\n" + "subs x19, x19, #0x1\n" + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x12, #0x0]\n" + "mov v16.16b, v15.16b\n" + "ldr q18, [x12, #0x10]\n" + "add x12, x12, #0x20\n" + "mov v7.16b, v15.16b\n" + "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "mov v8.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "ldr d1, [x3, #0x8]\n" + "mov v21.16b, v18.16b\n" + "ldr d2, [x3, #0x10]\n" + "mov v17.16b, v18.16b\n" + "ldr d3, [x3, #0x18]\n" + "mov v5.16b, v18.16b\n" + "ldr d4, [x3, #0x20]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldp x28, x27, [x25, #0x0]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "ldr d31, [x28, x10]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr d30, [x27, x10]\n" + "ldr d29, [x26, x10]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "ldr d28, [x13, x10]\n" + "usubl v29.8h, v29.8b, v9.8b\n" + "ldr d27, [x24, x10]\n" + "ldr d23, [x23, x10]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr d25, [x22, x10]\n" + "ldr d24, [x21, x10]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "ldr d26, [x20, x10]\n" + "ldr d22, [x0, x10]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "beq 2f\n" + "1:" // Loop + "smlal v15.4s, v31.4h, v0.4h\n" + "ldr x20, [x25, #0x50]\n" + "subs x19, x19, #0x1\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr x28, [x25, #0x58]\n" + "smlal v16.4s, v30.4h, v0.4h\n" + "ldr x0, [x25, #0x60]\n" + "smlal2 v21.4s, v30.8h, v0.8h\n" + "ldr d31, [x20, x10]\n" + "smlal v7.4s, v29.4h, v0.4h\n" + "ldr x7, [x25, #0x68]\n" + "smlal2 v17.4s, v29.8h, v0.8h\n" + "ldr x26, [x25, #0x70]\n" + "smlal v8.4s, v28.4h, v0.4h\n" + "ldr x23, [x25, #0x78]\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d0, [x3, #0x28]\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "ldr x20, [x25, #0x80]\n" + "smlal2 v18.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "smlal v16.4s, v27.4h, v1.4h\n" + "ldr x22, [x25, #0x88]\n" + "smlal2 v21.4s, v27.8h, v1.8h\n" + "ldr x13, [x25, #0x90]\n" + "smlal v7.4s, v28.4h, v1.4h\n" + "ldr x21, [x25, #0x98]\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "ldr x14, [x25, #0xa0]\n" + "smlal v8.4s, v23.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d1, [x3, #0x30]\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "smlal2 v18.4s, v27.8h, v2.8h\n" + "ldr d27, [x0, x10]\n" + 
"smlal v16.4s, v25.4h, v2.4h\n" + "ldr x0, [x25, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v2.8h\n" + "ldr x15, [x25, #0xc0]\n" + "smlal v7.4s, v23.4h, v2.4h\n" + "ldr x9, [x25, #0xc8]\n" + "smlal2 v17.4s, v23.8h, v2.8h\n" + "ldr x27, [x25, #0xd0]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr x28, [x25, #0xd8]\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr q6, [x2, #0x0]\n" + "smlal2 v18.4s, v25.8h, v3.8h\n" + "ldr d25, [x7, x10]\n" + "smlal v8.4s, v31.4h, v2.4h\n" + "ldr x12, [x25, #0xe0]\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "ldr d2, [x3, #0x38]\n" + "smlal v16.4s, v24.4h, v3.4h\n" + "ldr q19, [x5, #0x0]\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "ldr q20, [x2, #0x10]\n" + "add x2, x2, #0x20\n" + "smlal v7.4s, v31.4h, v3.4h\n" + "ldr q12, [x5, #0x10]\n" + "add x5, x5, #0x20\n" + "smlal2 v17.4s, v31.8h, v3.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v18.4s, v24.8h, v4.8h\n" + "ldr d24, [x26, x10]\n" + "smlal v8.4s, v30.4h, v3.4h\n" + "ldr x7, [x25, #0xe8]\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "ldr d3, [x3, #0x40]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v7.4s, v30.4h, v4.4h\n" + "smlal2 v17.4s, v30.8h, v4.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x10]\n" + "smlal v8.4s, v26.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0x48]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v18.4s, v29.8h, v0.8h\n" + "smlal v16.4s, v28.4h, v0.4h\n" + "smlal2 v21.4s, v28.8h, v0.8h\n" + "smlal v7.4s, v22.4h, v0.4h\n" + "smlal2 v17.4s, v22.8h, v0.8h\n" + "smlal v8.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d0, [x3, #0x50]\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x10]\n" + "smlal v16.4s, v23.4h, v1.4h\n" + "ldr x23, [x25, #0xf8]\n" + "smlal2 v21.4s, v23.8h, v1.8h\n" + "smlal v7.4s, v25.4h, v1.4h\n" + "smlal2 v17.4s, v25.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v8.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "ldr d1, [x3, #0x58]\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v18.4s, v23.8h, v2.8h\n" + "ldr d23, [x20, x10]\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "ldr x22, [x25, #0x100]\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "smlal v7.4s, v24.4h, v2.4h\n" + "smlal2 v17.4s, v24.8h, v2.8h\n" + "smlal v8.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d2, [x3, #0x60]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v18.4s, v31.8h, v3.8h\n" + "ldr d31, [x13, x10]\n" + "smlal v16.4s, v30.4h, v3.4h\n" + "ldr x20, [x25, #0x108]\n" + "smlal2 v21.4s, v30.8h, v3.8h\n" + "smlal v7.4s, v27.4h, v3.4h\n" + "smlal2 v17.4s, v27.8h, v3.8h\n" + "smlal v8.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "ldr d3, [x3, #0x68]\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v18.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x10]\n" + "smlal v16.4s, v26.4h, v4.4h\n" + "ldr x13, [x25, #0x110]\n" + "smlal2 v21.4s, v26.8h, v4.8h\n" + "ldr d26, [x14, x10]\n" + "smlal v7.4s, v23.4h, v4.4h\n" + "ldr x21, [x25, #0x118]\n" + "smlal2 v17.4s, v23.8h, v4.8h\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v8.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x3, #0x70]\n" + 
"smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v18.4s, v22.8h, v0.8h\n" + "ldr d22, [x0, x10]\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal v7.4s, v31.4h, v0.4h\n" + "smlal2 v17.4s, v31.8h, v0.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v8.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ldr d0, [x3, #0x78]\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v18.4s, v25.8h, v1.8h\n" + "ldr d25, [x11, x10]\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "smlal v7.4s, v30.4h, v1.4h\n" + "smlal2 v17.4s, v30.8h, v1.8h\n" + "smlal v8.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "ldr d1, [x3, #0x80]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v18.4s, v24.8h, v2.8h\n" + "ldr d24, [x24, x10]\n" + "smlal v16.4s, v27.4h, v2.4h\n" + "smlal2 v21.4s, v27.8h, v2.8h\n" + "smlal v7.4s, v26.4h, v2.4h\n" + "smlal2 v17.4s, v26.8h, v2.8h\n" + "smlal v8.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d2, [x3, #0x88]\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v18.4s, v27.8h, v3.8h\n" + "ldr d27, [x15, x10]\n" + "smlal v16.4s, v23.4h, v3.4h\n" + "smlal2 v21.4s, v23.8h, v3.8h\n" + "smlal v7.4s, v25.4h, v3.4h\n" + "smlal2 v17.4s, v25.8h, v3.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal v8.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ldr d3, [x3, #0x90]\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v18.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x10]\n" + "smlal v16.4s, v28.4h, v4.4h\n" + "smlal2 v21.4s, v28.8h, v4.8h\n" + "ldr d28, [x12, x10]\n" + "smlal v7.4s, v24.4h, v4.4h\n" + "smlal2 v17.4s, v24.8h, v4.8h\n" + "smlal v8.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "ldr d4, [x3, #0x98]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d31, [x27, x10]\n" + "smlal v16.4s, v30.4h, v0.4h\n" + "smlal2 v21.4s, v30.8h, v0.8h\n" + "smlal v7.4s, v27.4h, v0.4h\n" + "smlal2 v17.4s, v27.8h, v0.8h\n" + "smlal v8.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "ldr d0, [x3, #0xa0]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v18.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "smlal v16.4s, v26.4h, v1.4h\n" + "smlal2 v21.4s, v26.8h, v1.8h\n" + "smlal v7.4s, v23.4h, v1.4h\n" + "smlal2 v17.4s, v23.8h, v1.8h\n" + "smlal v8.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d1, [x3, #0xa8]\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v18.4s, v26.8h, v2.8h\n" + "ldr d26, [x7, x10]\n" + "smlal v16.4s, v25.4h, v2.4h\n" + "smlal2 v21.4s, v25.8h, v2.8h\n" + "smlal v7.4s, v31.4h, v2.4h\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "smlal v8.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d2, [x3, #0xb0]\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v18.4s, v25.8h, v3.8h\n" + "ldr d25, [x26, x10]\n" + "smlal v16.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "smlal v7.4s, v30.4h, v3.4h\n" + "smlal2 v17.4s, v30.8h, v3.8h\n" + "smlal v8.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "ldr d3, [x3, #0xb8]\n" + 
"ssubl v4.8h, v4.8b, v14.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v18.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x10]\n" + "smlal v16.4s, v22.4h, v4.4h\n" + "smlal2 v21.4s, v22.8h, v4.8h\n" + "smlal v7.4s, v28.4h, v4.4h\n" + "smlal2 v17.4s, v28.8h, v4.8h\n" + "smlal v8.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0xc0]\n" + "add x3, x3, #0xc8\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "smlal2 v18.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x10]\n" + "smlal v16.4s, v23.4h, v0.4h\n" + "smlal2 v21.4s, v23.8h, v0.8h\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v7.4s, v25.4h, v0.4h\n" + "smlal2 v17.4s, v25.8h, v0.8h\n" + "ldr d25, [x20, x10]\n" + "smlal v8.4s, v24.4h, v0.4h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v18.4s, v23.8h, v1.8h\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "smlal v7.4s, v24.4h, v1.4h\n" + "smlal2 v17.4s, v24.8h, v1.8h\n" + "ldr d24, [x13, x10]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v8.4s, v27.4h, v1.4h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v18.4s, v31.8h, v2.8h\n" + "smlal v16.4s, v30.4h, v2.4h\n" + "smlal2 v21.4s, v30.8h, v2.8h\n" + "smlal v7.4s, v27.4h, v2.4h\n" + "smlal2 v17.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x10]\n" + "add x10, x10, #0x8\n" + "smlal v8.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v18.4s, v30.8h, v3.8h\n" + "smlal v16.4s, v28.4h, v3.4h\n" + "smlal2 v21.4s, v28.8h, v3.8h\n" + "smlal v7.4s, v25.4h, v3.4h\n" + "smlal2 v17.4s, v25.8h, v3.8h\n" + "smlal v8.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v18.4s, v28.8h, v4.8h\n" + "smlal v16.4s, v26.4h, v4.4h\n" + "smlal2 v21.4s, v26.8h, v4.8h\n" + "smlal v7.4s, v24.4h, v4.4h\n" + "smlal2 v17.4s, v24.8h, v4.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "sqrdmulh v18.4s, v18.4s, v20.4s\n" + "smlal v8.4s, v27.4h, v4.4h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "and v28.16b, v15.16b, v19.16b\n" + "and v26.16b, v18.16b, v12.16b\n" + "sqrdmulh v16.4s, v16.4s, v6.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "sqadd v15.4s, v15.4s, v28.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "and v29.16b, v16.16b, v19.16b\n" + "and v4.16b, v21.16b, v12.16b\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "srshl v18.4s, v18.4s, v12.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "add v15.4s, v15.4s, v10.4s\n" + "add v18.4s, v18.4s, v10.4s\n" + "sqadd v16.4s, v16.4s, v29.4s\n" + "smin v15.4s, v15.4s, v13.4s\n" + "smin v18.4s, v18.4s, v13.4s\n" + "sqadd v21.4s, v21.4s, v4.4s\n" + "smax v15.4s, v15.4s, v11.4s\n" + "smax v18.4s, v18.4s, v11.4s\n" + "srshl v16.4s, v16.4s, v19.4s\n" + "srshl v21.4s, v21.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v18.16b\n" + "sqrdmulh v7.4s, v7.4s, v6.4s\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x17, x1]\n" + "add v16.4s, v16.4s, v10.4s\n" + "add v21.4s, v21.4s, v10.4s\n" + "and v25.16b, v7.16b, v19.16b\n" + "sqrdmulh v17.4s, v17.4s, v20.4s\n" + "smin v16.4s, v16.4s, v13.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" 
+ "smax v16.4s, v16.4s, v11.4s\n" + "smax v21.4s, v21.4s, v11.4s\n" + "sqadd v7.4s, v7.4s, v25.4s\n" + "and v31.16b, v17.16b, v12.16b\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "sqrdmulh v8.4s, v8.4s, v6.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x16, x1]\n" + "srshl v7.4s, v7.4s, v19.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "and v24.16b, v8.16b, v19.16b\n" + "sqrdmulh v5.4s, v5.4s, v20.4s\n" + "sqadd v17.4s, v17.4s, v31.4s\n" + "add v7.4s, v7.4s, v10.4s\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "and v1.16b, v5.16b, v12.16b\n" + "smin v7.4s, v7.4s, v13.4s\n" + "srshl v17.4s, v17.4s, v12.4s\n" + "sqadd v8.4s, v8.4s, v24.4s\n" + "smax v7.4s, v7.4s, v11.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "add v17.4s, v17.4s, v10.4s\n" + "srshl v8.4s, v8.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v1.4s\n" + "smin v17.4s, v17.4s, v13.4s\n" + "add v8.4s, v8.4s, v10.4s\n" + "smax v17.4s, v17.4s, v11.4s\n" + "srshl v5.4s, v5.4s, v12.4s\n" + "smin v8.4s, v8.4s, v13.4s\n" + "uzp1 v7.16b, v7.16b, v17.16b\n" + "add v5.4s, v5.4s, v10.4s\n" + "uzp1 v7.16b, v7.16b, v7.16b\n" + "str d7, [x6, x1]\n" + "smax v8.4s, v8.4s, v11.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "smax v5.4s, v5.4s, v11.4s\n" + "uzp1 v8.16b, v8.16b, v5.16b\n" + "uzp1 v8.16b, v8.16b, v8.16b\n" + "str d8, [x8, x1]\n" + "add x1, x1, #0x8\n" + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x12, #0x0]\n" + "mov v16.16b, v15.16b\n" + "ldr q18, [x12, #0x10]\n" + "add x12, x12, #0x20\n" + "mov v7.16b, v15.16b\n" + "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "mov v8.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "ldr d1, [x3, #0x8]\n" + "mov v21.16b, v18.16b\n" + "ldr d2, [x3, #0x10]\n" + "mov v17.16b, v18.16b\n" + "ldr d3, [x3, #0x18]\n" + "mov v5.16b, v18.16b\n" + "ldr d4, [x3, #0x20]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldp x28, x27, [x25, #0x0]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "ldr d31, [x28, x10]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr d30, [x27, x10]\n" + "ldr d29, [x26, x10]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "ldr d28, [x13, x10]\n" + "usubl v29.8h, v29.8b, v9.8b\n" + "ldr d27, [x24, x10]\n" + "ldr d23, [x23, x10]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr d25, [x22, x10]\n" + "ldr d24, [x21, x10]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "ldr d26, [x20, x10]\n" + "ldr d22, [x0, x10]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "bgt 1b\n" + "2:" // Tail + "smlal v15.4s, v31.4h, v0.4h\n" + "ldr x20, [x25, #0x50]\n" + "tst x4, #0x7\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr x28, [x25, #0x58]\n" + "smlal v16.4s, v30.4h, v0.4h\n" + "ldr x0, [x25, #0x60]\n" + "smlal2 v21.4s, v30.8h, v0.8h\n" + "ldr d31, [x20, x10]\n" + "smlal v7.4s, v29.4h, v0.4h\n" + "ldr x7, [x25, #0x68]\n" + "smlal2 v17.4s, v29.8h, v0.8h\n" + "ldr x26, [x25, #0x70]\n" + "smlal v8.4s, v28.4h, v0.4h\n" + "ldr x23, [x25, #0x78]\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d0, [x3, #0x28]\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "ldr x20, [x25, #0x80]\n" + "smlal2 v18.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "smlal v16.4s, v27.4h, v1.4h\n" + "ldr x22, [x25, #0x88]\n" + "smlal2 v21.4s, v27.8h, v1.8h\n" + "ldr x13, [x25, #0x90]\n" + "smlal v7.4s, v28.4h, v1.4h\n" + "ldr x21, [x25, #0x98]\n" + 
"smlal2 v17.4s, v28.8h, v1.8h\n" + "ldr x14, [x25, #0xa0]\n" + "smlal v8.4s, v23.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d1, [x3, #0x30]\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "smlal2 v18.4s, v27.8h, v2.8h\n" + "ldr d27, [x0, x10]\n" + "smlal v16.4s, v25.4h, v2.4h\n" + "ldr x0, [x25, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v2.8h\n" + "ldr x15, [x25, #0xc0]\n" + "smlal v7.4s, v23.4h, v2.4h\n" + "ldr x9, [x25, #0xc8]\n" + "smlal2 v17.4s, v23.8h, v2.8h\n" + "ldr x27, [x25, #0xd0]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr x28, [x25, #0xd8]\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "ldr x12, [x25, #0xe0]\n" + "smlal2 v18.4s, v25.8h, v3.8h\n" + "ldr d25, [x7, x10]\n" + "smlal v8.4s, v31.4h, v2.4h\n" + "ldr x7, [x25, #0xe8]\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "ldr d2, [x3, #0x38]\n" + "smlal v16.4s, v24.4h, v3.4h\n" + "ldr q6, [x2, #0x0]\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "ldr q19, [x5, #0x0]\n" + "smlal v7.4s, v31.4h, v3.4h\n" + "ldr q20, [x2, #0x10]\n" + "add x2, x2, #0x20\n" + "smlal2 v17.4s, v31.8h, v3.8h\n" + "ldr q12, [x5, #0x10]\n" + "add x5, x5, #0x20\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v18.4s, v24.8h, v4.8h\n" + "ldr d24, [x26, x10]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr x26, [x25, #0xf0]\n" + "smlal v8.4s, v30.4h, v3.4h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "ldr d3, [x3, #0x40]\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "ldr d27, [x23, x10]\n" + "smlal v7.4s, v30.4h, v4.4h\n" + "ldr x23, [x25, #0xf8]\n" + "smlal2 v17.4s, v30.8h, v4.8h\n" + "smlal v8.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0x48]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v18.4s, v29.8h, v0.8h\n" + "smlal v16.4s, v28.4h, v0.4h\n" + "smlal2 v21.4s, v28.8h, v0.8h\n" + "smlal v7.4s, v22.4h, v0.4h\n" + "smlal2 v17.4s, v22.8h, v0.8h\n" + "smlal v8.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d0, [x3, #0x50]\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "ldr d28, [x22, x10]\n" + "smlal v16.4s, v23.4h, v1.4h\n" + "ldr x22, [x25, #0x100]\n" + "smlal2 v21.4s, v23.8h, v1.8h\n" + "smlal v7.4s, v25.4h, v1.4h\n" + "smlal2 v17.4s, v25.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v8.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "ldr d1, [x3, #0x58]\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v18.4s, v23.8h, v2.8h\n" + "ldr d23, [x20, x10]\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "ldr x20, [x25, #0x108]\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "smlal v7.4s, v24.4h, v2.4h\n" + "smlal2 v17.4s, v24.8h, v2.8h\n" + "smlal v8.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d2, [x3, #0x60]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v18.4s, v31.8h, v3.8h\n" + "ldr d31, [x13, x10]\n" + "smlal v16.4s, v30.4h, v3.4h\n" + "ldr x13, [x25, #0x110]\n" + "smlal2 v21.4s, v30.8h, v3.8h\n" + "smlal v7.4s, v27.4h, v3.4h\n" + "smlal2 v17.4s, v27.8h, v3.8h\n" + "smlal v8.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "ldr d3, [x3, #0x68]\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v18.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x10]\n" + "smlal v16.4s, v26.4h, v4.4h\n" + "ldr x21, [x25, #0x118]\n" + "smlal2 v21.4s, 
v26.8h, v4.8h\n" + "ldr d26, [x14, x10]\n" + "smlal v7.4s, v23.4h, v4.4h\n" + "smlal2 v17.4s, v23.8h, v4.8h\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v8.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x3, #0x70]\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v18.4s, v22.8h, v0.8h\n" + "ldr d22, [x0, x10]\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal v7.4s, v31.4h, v0.4h\n" + "smlal2 v17.4s, v31.8h, v0.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v8.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ldr d0, [x3, #0x78]\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v18.4s, v25.8h, v1.8h\n" + "ldr d25, [x11, x10]\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "smlal v7.4s, v30.4h, v1.4h\n" + "smlal2 v17.4s, v30.8h, v1.8h\n" + "smlal v8.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "ldr d1, [x3, #0x80]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v18.4s, v24.8h, v2.8h\n" + "ldr d24, [x24, x10]\n" + "smlal v16.4s, v27.4h, v2.4h\n" + "smlal2 v21.4s, v27.8h, v2.8h\n" + "smlal v7.4s, v26.4h, v2.4h\n" + "smlal2 v17.4s, v26.8h, v2.8h\n" + "smlal v8.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d2, [x3, #0x88]\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v18.4s, v27.8h, v3.8h\n" + "ldr d27, [x15, x10]\n" + "smlal v16.4s, v23.4h, v3.4h\n" + "smlal2 v21.4s, v23.8h, v3.8h\n" + "smlal v7.4s, v25.4h, v3.4h\n" + "smlal2 v17.4s, v25.8h, v3.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal v8.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ldr d3, [x3, #0x90]\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v18.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x10]\n" + "smlal v16.4s, v28.4h, v4.4h\n" + "smlal2 v21.4s, v28.8h, v4.8h\n" + "ldr d28, [x12, x10]\n" + "smlal v7.4s, v24.4h, v4.4h\n" + "smlal2 v17.4s, v24.8h, v4.8h\n" + "smlal v8.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "ldr d4, [x3, #0x98]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d31, [x27, x10]\n" + "smlal v16.4s, v30.4h, v0.4h\n" + "smlal2 v21.4s, v30.8h, v0.8h\n" + "smlal v7.4s, v27.4h, v0.4h\n" + "smlal2 v17.4s, v27.8h, v0.8h\n" + "smlal v8.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "ldr d0, [x3, #0xa0]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v18.4s, v30.8h, v1.8h\n" + "ldr d30, [x28, x10]\n" + "smlal v16.4s, v26.4h, v1.4h\n" + "smlal2 v21.4s, v26.8h, v1.8h\n" + "smlal v7.4s, v23.4h, v1.4h\n" + "smlal2 v17.4s, v23.8h, v1.8h\n" + "smlal v8.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d1, [x3, #0xa8]\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v18.4s, v26.8h, v2.8h\n" + "ldr d26, [x7, x10]\n" + "smlal v16.4s, v25.4h, v2.4h\n" + "smlal2 v21.4s, v25.8h, v2.8h\n" + "smlal v7.4s, v31.4h, v2.4h\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "smlal v8.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d2, [x3, #0xb0]\n" + "smlal v15.4s, 
v25.4h, v3.4h\n" + "smlal2 v18.4s, v25.8h, v3.8h\n" + "ldr d25, [x26, x10]\n" + "smlal v16.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "smlal v7.4s, v30.4h, v3.4h\n" + "smlal2 v17.4s, v30.8h, v3.8h\n" + "smlal v8.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "ldr d3, [x3, #0xb8]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v18.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x10]\n" + "smlal v16.4s, v22.4h, v4.4h\n" + "smlal2 v21.4s, v22.8h, v4.8h\n" + "smlal v7.4s, v28.4h, v4.4h\n" + "smlal2 v17.4s, v28.8h, v4.8h\n" + "smlal v8.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x3, #0xc0]\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "smlal2 v18.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x10]\n" + "smlal v16.4s, v23.4h, v0.4h\n" + "smlal2 v21.4s, v23.8h, v0.8h\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v7.4s, v25.4h, v0.4h\n" + "smlal2 v17.4s, v25.8h, v0.8h\n" + "ldr d25, [x20, x10]\n" + "smlal v8.4s, v24.4h, v0.4h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v18.4s, v23.8h, v1.8h\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "smlal v7.4s, v24.4h, v1.4h\n" + "smlal2 v17.4s, v24.8h, v1.8h\n" + "ldr d24, [x13, x10]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v8.4s, v27.4h, v1.4h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v18.4s, v31.8h, v2.8h\n" + "smlal v16.4s, v30.4h, v2.4h\n" + "smlal2 v21.4s, v30.8h, v2.8h\n" + "smlal v7.4s, v27.4h, v2.4h\n" + "smlal2 v17.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x10]\n" + "add x10, x10, #0x8\n" + "smlal v8.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v18.4s, v30.8h, v3.8h\n" + "smlal v16.4s, v28.4h, v3.4h\n" + "smlal2 v21.4s, v28.8h, v3.8h\n" + "smlal v7.4s, v25.4h, v3.4h\n" + "smlal2 v17.4s, v25.8h, v3.8h\n" + "smlal v8.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v18.4s, v28.8h, v4.8h\n" + "smlal v16.4s, v26.4h, v4.4h\n" + "smlal2 v21.4s, v26.8h, v4.8h\n" + "smlal v7.4s, v24.4h, v4.4h\n" + "smlal2 v17.4s, v24.8h, v4.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "sqrdmulh v18.4s, v18.4s, v20.4s\n" + "smlal v8.4s, v27.4h, v4.4h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "and v28.16b, v15.16b, v19.16b\n" + "and v26.16b, v18.16b, v12.16b\n" + "sqrdmulh v16.4s, v16.4s, v6.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "sqadd v15.4s, v15.4s, v28.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "and v29.16b, v16.16b, v19.16b\n" + "and v4.16b, v21.16b, v12.16b\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "srshl v18.4s, v18.4s, v12.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "add v15.4s, v15.4s, v10.4s\n" + "add v18.4s, v18.4s, v10.4s\n" + "sqadd v16.4s, v16.4s, v29.4s\n" + "smin v15.4s, v15.4s, v13.4s\n" + "smin v18.4s, v18.4s, v13.4s\n" + "sqadd v21.4s, v21.4s, v4.4s\n" + "smax v15.4s, v15.4s, v11.4s\n" + "smax v18.4s, v18.4s, v11.4s\n" + "srshl v16.4s, v16.4s, v19.4s\n" + "srshl v21.4s, v21.4s, v12.4s\n" + "uzp1 v15.16b, v15.16b, v18.16b\n" + "sqrdmulh v7.4s, v7.4s, v6.4s\n" + "uzp1 
v15.16b, v15.16b, v15.16b\n" + "str d15, [x17, x1]\n" + "add v16.4s, v16.4s, v10.4s\n" + "add v21.4s, v21.4s, v10.4s\n" + "and v25.16b, v7.16b, v19.16b\n" + "sqrdmulh v17.4s, v17.4s, v20.4s\n" + "smin v16.4s, v16.4s, v13.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "smax v16.4s, v16.4s, v11.4s\n" + "smax v21.4s, v21.4s, v11.4s\n" + "sqadd v7.4s, v7.4s, v25.4s\n" + "and v31.16b, v17.16b, v12.16b\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "sqrdmulh v8.4s, v8.4s, v6.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "str d16, [x16, x1]\n" + "srshl v7.4s, v7.4s, v19.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "and v24.16b, v8.16b, v19.16b\n" + "sqrdmulh v5.4s, v5.4s, v20.4s\n" + "sqadd v17.4s, v17.4s, v31.4s\n" + "add v7.4s, v7.4s, v10.4s\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "and v1.16b, v5.16b, v12.16b\n" + "smin v7.4s, v7.4s, v13.4s\n" + "srshl v17.4s, v17.4s, v12.4s\n" + "sqadd v8.4s, v8.4s, v24.4s\n" + "smax v7.4s, v7.4s, v11.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "add v17.4s, v17.4s, v10.4s\n" + "srshl v8.4s, v8.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v1.4s\n" + "smin v17.4s, v17.4s, v13.4s\n" + "add v8.4s, v8.4s, v10.4s\n" + "smax v17.4s, v17.4s, v11.4s\n" + "srshl v5.4s, v5.4s, v12.4s\n" + "smin v8.4s, v8.4s, v13.4s\n" + "uzp1 v7.16b, v7.16b, v17.16b\n" + "add v5.4s, v5.4s, v10.4s\n" + "uzp1 v7.16b, v7.16b, v7.16b\n" + "str d7, [x6, x1]\n" + "smax v8.4s, v8.4s, v11.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "smax v5.4s, v5.4s, v11.4s\n" + "uzp1 v8.16b, v8.16b, v5.16b\n" + "uzp1 v8.16b, v8.16b, v8.16b\n" + "str d8, [x8, x1]\n" + "add x1, x1, #0x8\n" + "beq 124f\n" + "add x3, x3, #0xc8\n" + "3:" // Oddments + "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x4, #2, 5f\n" + "ld1 { v15.4s }, [x12], #0x10\n" + "tbz x4, #1, 4f\n" + "ld1 { v18.d }[0], [x12], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v18.s }[2], [x12]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v18.s }[0], [x12]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x4, #1, 6f\n" + "ld1 { v15.d }[0], [x12], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[2], [x12]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v15.s }[0], [x12]\n" + "7:" // Oddments: Load bias: Bit 2: End + "mov v16.16b, v15.16b\n" + "ldr d0, [x3, #0x0]\n" + "mov v21.16b, v18.16b\n" + "ldr d1, [x3, #0x8]\n" + "mov v7.16b, v15.16b\n" + "ldr d2, [x3, #0x10]\n" + "mov v17.16b, v18.16b\n" + "ldr d3, [x3, #0x18]\n" + "mov v8.16b, v15.16b\n" + "ldr d4, [x3, #0x20]\n" + "mov v5.16b, v18.16b\n" + "ldp x28, x27, [x25, #0x0]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldp x26, x13, [x25, #0x10]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ldp x24, x23, [x25, #0x20]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldp x22, x21, [x25, #0x30]\n" + "ldp x20, x0, [x25, #0x40]\n" + "add x28, x28, x10\n" + "add x27, x27, x10\n" + "add x26, x26, x10\n" + "add x13, x13, x10\n" + "add x24, x24, x10\n" + "add x23, x23, x10\n" + "add x22, x22, x10\n" + "add x21, x21, x10\n" + "add x20, x20, x10\n" + "add x0, x0, x10\n" + "tbz x4, #2, 9f\n" + "ld1 { v31.s }[0], [x28], #0x4\n" + "ld1 { v30.s }[0], [x27], #0x4\n" + "ld1 { v29.s }[0], [x26], #0x4\n" + "ld1 { v28.s }[0], [x13], #0x4\n" + "ld1 { v27.s }[0], [x24], #0x4\n" + "ld1 { v23.s }[0], [x23], #0x4\n" + "ld1 { v25.s }[0], [x22], #0x4\n" + "ld1 { v24.s }[0], [x21], #0x4\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "ld1 { v22.s }[0], [x0], #0x4\n" + "tbz x4, #1, 
8f\n" + "ld1 { v31.h }[2], [x28], #0x2\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x13], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v24.h }[2], [x21], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v22.h }[2], [x0], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[6], [x28]\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x13]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v24.b }[6], [x21]\n" + "ld1 { v26.b }[6], [x20]\n" + "ld1 { v22.b }[6], [x0]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[4], [x28]\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x13]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v24.b }[4], [x21]\n" + "ld1 { v26.b }[4], [x20]\n" + "ld1 { v22.b }[4], [x0]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x4, #1, 10f\n" + "ld1 { v31.h }[0], [x28], #0x2\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "ld1 { v29.h }[0], [x26], #0x2\n" + "ld1 { v28.h }[0], [x13], #0x2\n" + "ld1 { v27.h }[0], [x24], #0x2\n" + "ld1 { v23.h }[0], [x23], #0x2\n" + "ld1 { v25.h }[0], [x22], #0x2\n" + "ld1 { v24.h }[0], [x21], #0x2\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "ld1 { v22.h }[0], [x0], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[2], [x28]\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x13]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v24.b }[2], [x21]\n" + "ld1 { v26.b }[2], [x20]\n" + "ld1 { v22.b }[2], [x0]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[0], [x28]\n" + "ld1 { v30.b }[0], [x27]\n" + "ld1 { v29.b }[0], [x26]\n" + "ld1 { v28.b }[0], [x13]\n" + "ld1 { v27.b }[0], [x24]\n" + "ld1 { v23.b }[0], [x23]\n" + "ld1 { v25.b }[0], [x22]\n" + "ld1 { v24.b }[0], [x21]\n" + "ld1 { v26.b }[0], [x20]\n" + "ld1 { v22.b }[0], [x0]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr x20, [x25, #0x50]\n" + "add x20, x20, x10\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "usubl v29.8h, v29.8b, v9.8b\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "smlal v16.4s, v30.4h, v0.4h\n" + "smlal2 v21.4s, v30.8h, v0.8h\n" + "smlal v7.4s, v29.4h, v0.4h\n" + "smlal2 v17.4s, v29.8h, v0.8h\n" + "smlal v8.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v18.4s, v30.8h, v1.8h\n" + "smlal v16.4s, v27.4h, v1.4h\n" + "smlal2 v21.4s, v27.8h, v1.8h\n" + "smlal v7.4s, v28.4h, v1.4h\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "smlal v8.4s, v23.4h, v1.4h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "smlal v15.4s, v27.4h, v2.4h\n" + "smlal2 v18.4s, v27.8h, v2.8h\n" + "smlal v16.4s, v25.4h, v2.4h\n" + "smlal2 v21.4s, v25.8h, v2.8h\n" + "smlal v7.4s, v23.4h, v2.4h\n" + "smlal2 v17.4s, v23.8h, v2.8h\n" + "tbz x4, #2, 13f\n" + "ld1 { v31.s }[0], [x20], #0x4\n" + "tbz x4, #1, 12f\n" + "ld1 { v31.h }[2], [x20], #0x2\n" + "tbz x4, #0, 
15f\n" + "ld1 { v31.b }[6], [x20]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[4], [x20]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x4, #1, 14f\n" + "ld1 { v31.h }[0], [x20], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[2], [x20]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[0], [x20]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr x28, [x25, #0x58]\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "add x28, x28, x10\n" + "smlal v8.4s, v31.4h, v2.4h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "smlal2 v18.4s, v25.8h, v3.8h\n" + "smlal v16.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "smlal v7.4s, v31.4h, v3.4h\n" + "smlal2 v17.4s, v31.8h, v3.8h\n" + "tbz x4, #2, 17f\n" + "ld1 { v30.s }[0], [x28], #0x4\n" + "tbz x4, #1, 16f\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[6], [x28]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[4], [x28]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x4, #1, 18f\n" + "ld1 { v30.h }[0], [x28], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[2], [x28]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[0], [x28]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "usubl v30.8h, v30.8b, v9.8b\n" + "ldr x0, [x25, #0x60]\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "add x0, x0, x10\n" + "smlal v8.4s, v30.4h, v3.4h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "smlal2 v18.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 21f\n" + "ld1 { v27.s }[0], [x0], #0x4\n" + "tbz x4, #1, 20f\n" + "ld1 { v27.h }[2], [x0], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[6], [x0]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[4], [x0]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 5): Bit 2: Unset + "tbz x4, #1, 22f\n" + "ld1 { v27.h }[0], [x0], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[2], [x0]\n" + "b 23f\n" + "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[0], [x0]\n" + "23:" // Oddments: Load (0, 5): Bit 2: End + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr d0, [x3, #0x28]\n" + "smlal v7.4s, v30.4h, v4.4h\n" + "ldr x7, [x25, #0x68]\n" + "add x7, x7, x10\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v21.4s, v27.8h, v4.8h\n" + "smlal2 v17.4s, v30.8h, v4.8h\n" + "smlal v8.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "smlal2 v18.4s, v29.8h, v0.8h\n" + "smlal v16.4s, v28.4h, v0.4h\n" + "smlal2 v21.4s, v28.8h, v0.8h\n" + "smlal v7.4s, v22.4h, v0.4h\n" + "smlal2 v17.4s, v22.8h, v0.8h\n" + "tbz x4, #2, 25f\n" + "ld1 { v25.s }[0], [x7], #0x4\n" + "tbz x4, #1, 24f\n" + "ld1 { v25.h }[2], [x7], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[6], [x7]\n" + "b 27f\n" + "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[4], [x7]\n" + "b 27f\n" + "25:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x4, #1, 26f\n" + "ld1 { v25.h }[0], [x7], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[2], [x7]\n" + "b 27f\n" + "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[0], [x7]\n" + "27:" // Oddments: Load (2, 1): Bit 2: End + "usubl v25.8h, v25.8b, v9.8b\n" + "ldr d1, [x3, #0x30]\n" + "smlal 
v8.4s, v25.4h, v0.4h\n" + "ldr x26, [x25, #0x70]\n" + "add x26, x26, x10\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "smlal v16.4s, v23.4h, v1.4h\n" + "smlal2 v21.4s, v23.8h, v1.8h\n" + "smlal v7.4s, v25.4h, v1.4h\n" + "smlal2 v17.4s, v25.8h, v1.8h\n" + "tbz x4, #2, 29f\n" + "ld1 { v24.s }[0], [x26], #0x4\n" + "tbz x4, #1, 28f\n" + "ld1 { v24.h }[2], [x26], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[6], [x26]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[4], [x26]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x4, #1, 30f\n" + "ld1 { v24.h }[0], [x26], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[2], [x26]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[0], [x26]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "usubl v24.8h, v24.8b, v9.8b\n" + "ldr d2, [x3, #0x38]\n" + "smlal v8.4s, v24.4h, v1.4h\n" + "ldr x23, [x25, #0x78]\n" + "add x23, x23, x10\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "smlal2 v18.4s, v23.8h, v2.8h\n" + "smlal v16.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "smlal v7.4s, v24.4h, v2.4h\n" + "smlal2 v17.4s, v24.8h, v2.8h\n" + "tbz x4, #2, 33f\n" + "ld1 { v27.s }[0], [x23], #0x4\n" + "tbz x4, #1, 32f\n" + "ld1 { v27.h }[2], [x23], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[6], [x23]\n" + "b 35f\n" + "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[4], [x23]\n" + "b 35f\n" + "33:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x4, #1, 34f\n" + "ld1 { v27.h }[0], [x23], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[2], [x23]\n" + "b 35f\n" + "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[0], [x23]\n" + "35:" // Oddments: Load (2, 3): Bit 2: End + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr d3, [x3, #0x40]\n" + "smlal v8.4s, v27.4h, v2.4h\n" + "ldr x20, [x25, #0x80]\n" + "add x20, x20, x10\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v18.4s, v31.8h, v3.8h\n" + "smlal v16.4s, v30.4h, v3.4h\n" + "smlal2 v21.4s, v30.8h, v3.8h\n" + "smlal v7.4s, v27.4h, v3.4h\n" + "smlal2 v17.4s, v27.8h, v3.8h\n" + "tbz x4, #2, 37f\n" + "ld1 { v23.s }[0], [x20], #0x4\n" + "tbz x4, #1, 36f\n" + "ld1 { v23.h }[2], [x20], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[6], [x20]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[4], [x20]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x4, #1, 38f\n" + "ld1 { v23.h }[0], [x20], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[2], [x20]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[0], [x20]\n" + "39:" // Oddments: Load (2, 4): Bit 2: End + "usubl v23.8h, v23.8b, v9.8b\n" + "ldr d4, [x3, #0x48]\n" + "smlal v8.4s, v23.4h, v3.4h\n" + "ldr x22, [x25, #0x88]\n" + "add x22, x22, x10\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v18.4s, v30.8h, v4.8h\n" + "smlal v16.4s, v26.4h, v4.4h\n" + "smlal2 v21.4s, v26.8h, v4.8h\n" + "smlal v7.4s, v23.4h, v4.4h\n" + "smlal2 v17.4s, v23.8h, v4.8h\n" + "tbz x4, #2, 41f\n" + "ld1 { v28.s }[0], [x22], #0x4\n" + "tbz x4, #1, 
40f\n" + "ld1 { v28.h }[2], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 5): Bit 2: Unset + "tbz x4, #1, 42f\n" + "ld1 { v28.h }[0], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 5): Bit 2: End + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr d0, [x3, #0x50]\n" + "smlal v8.4s, v28.4h, v4.4h\n" + "ldr x13, [x25, #0x90]\n" + "add x13, x13, x10\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "smlal2 v18.4s, v22.8h, v0.8h\n" + "smlal v16.4s, v25.4h, v0.4h\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "tbz x4, #2, 45f\n" + "ld1 { v31.s }[0], [x13], #0x4\n" + "tbz x4, #1, 44f\n" + "ld1 { v31.h }[2], [x13], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[6], [x13]\n" + "b 47f\n" + "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[4], [x13]\n" + "b 47f\n" + "45:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x4, #1, 46f\n" + "ld1 { v31.h }[0], [x13], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[2], [x13]\n" + "b 47f\n" + "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[0], [x13]\n" + "47:" // Oddments: Load (3, 0): Bit 2: End + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr x21, [x25, #0x98]\n" + "smlal v7.4s, v31.4h, v0.4h\n" + "add x21, x21, x10\n" + "smlal2 v17.4s, v31.8h, v0.8h\n" + "tbz x4, #2, 49f\n" + "ld1 { v30.s }[0], [x21], #0x4\n" + "tbz x4, #1, 48f\n" + "ld1 { v30.h }[2], [x21], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[6], [x21]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[4], [x21]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x4, #1, 50f\n" + "ld1 { v30.h }[0], [x21], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[2], [x21]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[0], [x21]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "usubl v30.8h, v30.8b, v9.8b\n" + "ldr d1, [x3, #0x58]\n" + "smlal v8.4s, v30.4h, v0.4h\n" + "ldr x14, [x25, #0xa0]\n" + "add x14, x14, x10\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v15.4s, v25.4h, v1.4h\n" + "smlal2 v18.4s, v25.8h, v1.8h\n" + "smlal v16.4s, v24.4h, v1.4h\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "smlal v7.4s, v30.4h, v1.4h\n" + "smlal2 v17.4s, v30.8h, v1.8h\n" + "tbz x4, #2, 53f\n" + "ld1 { v26.s }[0], [x14], #0x4\n" + "tbz x4, #1, 52f\n" + "ld1 { v26.h }[2], [x14], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[6], [x14]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[4], [x14]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x4, #1, 54f\n" + "ld1 { v26.h }[0], [x14], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[2], [x14]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[0], [x14]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "usubl v26.8h, v26.8b, v9.8b\n" + "ldr d2, [x3, #0x60]\n" + "smlal v8.4s, v26.4h, v1.4h\n" + "ldr x11, [x25, #0xa8]\n" + "add x11, x11, x10\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "ssubl v2.8h, v2.8b, 
v14.8b\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v18.4s, v24.8h, v2.8h\n" + "smlal v16.4s, v27.4h, v2.4h\n" + "smlal2 v21.4s, v27.8h, v2.8h\n" + "smlal v7.4s, v26.4h, v2.4h\n" + "smlal2 v17.4s, v26.8h, v2.8h\n" + "tbz x4, #2, 57f\n" + "ld1 { v25.s }[0], [x11], #0x4\n" + "tbz x4, #1, 56f\n" + "ld1 { v25.h }[2], [x11], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[6], [x11]\n" + "b 59f\n" + "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[4], [x11]\n" + "b 59f\n" + "57:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x4, #1, 58f\n" + "ld1 { v25.h }[0], [x11], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[2], [x11]\n" + "b 59f\n" + "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[0], [x11]\n" + "59:" // Oddments: Load (3, 3): Bit 2: End + "usubl v25.8h, v25.8b, v9.8b\n" + "ldr d3, [x3, #0x68]\n" + "smlal v8.4s, v25.4h, v2.4h\n" + "ldr x24, [x25, #0xb0]\n" + "add x24, x24, x10\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v18.4s, v27.8h, v3.8h\n" + "smlal v16.4s, v23.4h, v3.4h\n" + "smlal2 v21.4s, v23.8h, v3.8h\n" + "smlal v7.4s, v25.4h, v3.4h\n" + "smlal2 v17.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 61f\n" + "ld1 { v24.s }[0], [x24], #0x4\n" + "tbz x4, #1, 60f\n" + "ld1 { v24.h }[2], [x24], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[6], [x24]\n" + "b 63f\n" + "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[4], [x24]\n" + "b 63f\n" + "61:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x4, #1, 62f\n" + "ld1 { v24.h }[0], [x24], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[2], [x24]\n" + "b 63f\n" + "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[0], [x24]\n" + "63:" // Oddments: Load (3, 4): Bit 2: End + "usubl v24.8h, v24.8b, v9.8b\n" + "ldr d4, [x3, #0x70]\n" + "smlal v8.4s, v24.4h, v3.4h\n" + "ldr x0, [x25, #0xb8]\n" + "add x0, x0, x10\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v18.4s, v23.8h, v4.8h\n" + "smlal v16.4s, v28.4h, v4.4h\n" + "smlal2 v21.4s, v28.8h, v4.8h\n" + "smlal v7.4s, v24.4h, v4.4h\n" + "smlal2 v17.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 65f\n" + "ld1 { v22.s }[0], [x0], #0x4\n" + "tbz x4, #1, 64f\n" + "ld1 { v22.h }[2], [x0], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[6], [x0]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[4], [x0]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 5): Bit 2: Unset + "tbz x4, #1, 66f\n" + "ld1 { v22.h }[0], [x0], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[2], [x0]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[0], [x0]\n" + "67:" // Oddments: Load (3, 5): Bit 2: End + "usubl v22.8h, v22.8b, v9.8b\n" + "ldr d0, [x3, #0x78]\n" + "smlal v8.4s, v22.4h, v4.4h\n" + "ldr x15, [x25, #0xc0]\n" + "add x15, x15, x10\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "smlal v16.4s, v30.4h, v0.4h\n" + "smlal2 v21.4s, v30.8h, v0.8h\n" + "tbz x4, #2, 69f\n" + "ld1 { v27.s }[0], [x15], #0x4\n" + "tbz x4, #1, 68f\n" + "ld1 { v27.h }[2], [x15], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[6], [x15]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[4], [x15]\n" + 
"b 71f\n" + "69:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x4, #1, 70f\n" + "ld1 { v27.h }[0], [x15], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[2], [x15]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[0], [x15]\n" + "71:" // Oddments: Load (4, 0): Bit 2: End + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr x9, [x25, #0xc8]\n" + "smlal v7.4s, v27.4h, v0.4h\n" + "add x9, x9, x10\n" + "smlal2 v17.4s, v27.8h, v0.8h\n" + "tbz x4, #2, 73f\n" + "ld1 { v23.s }[0], [x9], #0x4\n" + "tbz x4, #1, 72f\n" + "ld1 { v23.h }[2], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[6], [x9]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[4], [x9]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x4, #1, 74f\n" + "ld1 { v23.h }[0], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[2], [x9]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[0], [x9]\n" + "75:" // Oddments: Load (4, 1): Bit 2: End + "usubl v23.8h, v23.8b, v9.8b\n" + "ldr d1, [x3, #0x80]\n" + "smlal v8.4s, v23.4h, v0.4h\n" + "ldr x27, [x25, #0xd0]\n" + "add x27, x27, x10\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v15.4s, v30.4h, v1.4h\n" + "smlal2 v18.4s, v30.8h, v1.8h\n" + "smlal v16.4s, v26.4h, v1.4h\n" + "smlal2 v21.4s, v26.8h, v1.8h\n" + "smlal v7.4s, v23.4h, v1.4h\n" + "smlal2 v17.4s, v23.8h, v1.8h\n" + "tbz x4, #2, 77f\n" + "ld1 { v31.s }[0], [x27], #0x4\n" + "tbz x4, #1, 76f\n" + "ld1 { v31.h }[2], [x27], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[6], [x27]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[4], [x27]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x4, #1, 78f\n" + "ld1 { v31.h }[0], [x27], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[2], [x27]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[0], [x27]\n" + "79:" // Oddments: Load (4, 2): Bit 2: End + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr d2, [x3, #0x88]\n" + "smlal v8.4s, v31.4h, v1.4h\n" + "ldr x28, [x25, #0xd8]\n" + "add x28, x28, x10\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v18.4s, v26.8h, v2.8h\n" + "smlal v16.4s, v25.4h, v2.4h\n" + "smlal2 v21.4s, v25.8h, v2.8h\n" + "smlal v7.4s, v31.4h, v2.4h\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "tbz x4, #2, 81f\n" + "ld1 { v30.s }[0], [x28], #0x4\n" + "tbz x4, #1, 80f\n" + "ld1 { v30.h }[2], [x28], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[6], [x28]\n" + "b 83f\n" + "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[4], [x28]\n" + "b 83f\n" + "81:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x4, #1, 82f\n" + "ld1 { v30.h }[0], [x28], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[2], [x28]\n" + "b 83f\n" + "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[0], [x28]\n" + "83:" // Oddments: Load (4, 3): Bit 2: End + "usubl v30.8h, v30.8b, v9.8b\n" + "ldr d3, [x3, #0x90]\n" + "smlal v8.4s, v30.4h, v2.4h\n" + "ldr x12, [x25, #0xe0]\n" + "add x12, x12, x10\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v18.4s, v25.8h, v3.8h\n" + "smlal v16.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v24.8h, v3.8h\n" + "smlal 
v7.4s, v30.4h, v3.4h\n" + "smlal2 v17.4s, v30.8h, v3.8h\n" + "tbz x4, #2, 85f\n" + "ld1 { v28.s }[0], [x12], #0x4\n" + "tbz x4, #1, 84f\n" + "ld1 { v28.h }[2], [x12], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[6], [x12]\n" + "b 87f\n" + "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[4], [x12]\n" + "b 87f\n" + "85:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x4, #1, 86f\n" + "ld1 { v28.h }[0], [x12], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[2], [x12]\n" + "b 87f\n" + "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[0], [x12]\n" + "87:" // Oddments: Load (4, 4): Bit 2: End + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr d4, [x3, #0x98]\n" + "smlal v8.4s, v28.4h, v3.4h\n" + "ldr x7, [x25, #0xe8]\n" + "add x7, x7, x10\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v18.4s, v24.8h, v4.8h\n" + "smlal v16.4s, v22.4h, v4.4h\n" + "smlal2 v21.4s, v22.8h, v4.8h\n" + "smlal v7.4s, v28.4h, v4.4h\n" + "smlal2 v17.4s, v28.8h, v4.8h\n" + "tbz x4, #2, 89f\n" + "ld1 { v26.s }[0], [x7], #0x4\n" + "tbz x4, #1, 88f\n" + "ld1 { v26.h }[2], [x7], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[6], [x7]\n" + "b 91f\n" + "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[4], [x7]\n" + "b 91f\n" + "89:" // Oddments: Load (4, 5): Bit 2: Unset + "tbz x4, #1, 90f\n" + "ld1 { v26.h }[0], [x7], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[2], [x7]\n" + "b 91f\n" + "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[0], [x7]\n" + "91:" // Oddments: Load (4, 5): Bit 2: End + "usubl v26.8h, v26.8b, v9.8b\n" + "ldr d0, [x3, #0xa0]\n" + "smlal v8.4s, v26.4h, v4.4h\n" + "ldr x26, [x25, #0xf0]\n" + "add x26, x26, x10\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v15.4s, v27.4h, v0.4h\n" + "smlal2 v18.4s, v27.8h, v0.8h\n" + "smlal v16.4s, v23.4h, v0.4h\n" + "smlal2 v21.4s, v23.8h, v0.8h\n" + "tbz x4, #2, 93f\n" + "ld1 { v25.s }[0], [x26], #0x4\n" + "tbz x4, #1, 92f\n" + "ld1 { v25.h }[2], [x26], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[6], [x26]\n" + "b 95f\n" + "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[4], [x26]\n" + "b 95f\n" + "93:" // Oddments: Load (5, 0): Bit 2: Unset + "tbz x4, #1, 94f\n" + "ld1 { v25.h }[0], [x26], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[2], [x26]\n" + "b 95f\n" + "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[0], [x26]\n" + "95:" // Oddments: Load (5, 0): Bit 2: End + "usubl v25.8h, v25.8b, v9.8b\n" + "ldr x23, [x25, #0xf8]\n" + "smlal v7.4s, v25.4h, v0.4h\n" + "add x23, x23, x10\n" + "smlal2 v17.4s, v25.8h, v0.8h\n" + "tbz x4, #2, 97f\n" + "ld1 { v24.s }[0], [x23], #0x4\n" + "tbz x4, #1, 96f\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[6], [x23]\n" + "b 99f\n" + "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[4], [x23]\n" + "b 99f\n" + "97:" // Oddments: Load (5, 1): Bit 2: Unset + "tbz x4, #1, 98f\n" + "ld1 { v24.h }[0], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[2], [x23]\n" + "b 99f\n" + "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[0], [x23]\n" + "99:" // Oddments: Load (5, 1): Bit 2: End + "usubl v24.8h, v24.8b, v9.8b\n" + "ldr d1, [x3, #0xa8]\n" + "smlal v8.4s, v24.4h, 
v0.4h\n" + "ldr x22, [x25, #0x100]\n" + "add x22, x22, x10\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v15.4s, v23.4h, v1.4h\n" + "smlal2 v18.4s, v23.8h, v1.8h\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "smlal2 v21.4s, v31.8h, v1.8h\n" + "smlal v7.4s, v24.4h, v1.4h\n" + "smlal2 v17.4s, v24.8h, v1.8h\n" + "tbz x4, #2, 101f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x4, #1, 100f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[6], [x22]\n" + "b 103f\n" + "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[4], [x22]\n" + "b 103f\n" + "101:" // Oddments: Load (5, 2): Bit 2: Unset + "tbz x4, #1, 102f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[2], [x22]\n" + "b 103f\n" + "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[0], [x22]\n" + "103:" // Oddments: Load (5, 2): Bit 2: End + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr d2, [x3, #0xb0]\n" + "smlal v8.4s, v27.4h, v1.4h\n" + "ldr x20, [x25, #0x108]\n" + "add x20, x20, x10\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v15.4s, v31.4h, v2.4h\n" + "smlal2 v18.4s, v31.8h, v2.8h\n" + "smlal v16.4s, v30.4h, v2.4h\n" + "smlal2 v21.4s, v30.8h, v2.8h\n" + "smlal v7.4s, v27.4h, v2.4h\n" + "smlal2 v17.4s, v27.8h, v2.8h\n" + "tbz x4, #2, 105f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x4, #1, 104f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 107f\n" + "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 107f\n" + "105:" // Oddments: Load (5, 3): Bit 2: Unset + "tbz x4, #1, 106f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 107f\n" + "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[0], [x20]\n" + "107:" // Oddments: Load (5, 3): Bit 2: End + "usubl v25.8h, v25.8b, v9.8b\n" + "ldr d3, [x3, #0xb8]\n" + "smlal v8.4s, v25.4h, v2.4h\n" + "ldr x13, [x25, #0x110]\n" + "add x13, x13, x10\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v15.4s, v30.4h, v3.4h\n" + "smlal2 v18.4s, v30.8h, v3.8h\n" + "smlal v16.4s, v28.4h, v3.4h\n" + "smlal2 v21.4s, v28.8h, v3.8h\n" + "smlal v7.4s, v25.4h, v3.4h\n" + "smlal2 v17.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 109f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "tbz x4, #1, 108f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[6], [x13]\n" + "b 111f\n" + "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[4], [x13]\n" + "b 111f\n" + "109:" // Oddments: Load (5, 4): Bit 2: Unset + "tbz x4, #1, 110f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[2], [x13]\n" + "b 111f\n" + "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[0], [x13]\n" + "111:" // Oddments: Load (5, 4): Bit 2: End + "usubl v24.8h, v24.8b, v9.8b\n" + "ldr d4, [x3, #0xc0]\n" + "smlal v8.4s, v24.4h, v3.4h\n" + "ldr x21, [x25, #0x118]\n" + "add x21, x21, x10\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v15.4s, v28.4h, v4.4h\n" + "smlal2 v18.4s, v28.8h, v4.8h\n" + "smlal v16.4s, v26.4h, v4.4h\n" + "smlal2 v21.4s, v26.8h, v4.8h\n" + "smlal v7.4s, v24.4h, v4.4h\n" + "smlal2 v17.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 113f\n" + "ld1 { v27.s }[0], 
[x21], #0x4\n" + "tbz x4, #1, 112f\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[6], [x21]\n" + "b 115f\n" + "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[4], [x21]\n" + "b 115f\n" + "113:" // Oddments: Load (5, 5): Bit 2: Unset + "tbz x4, #1, 114f\n" + "ld1 { v27.h }[0], [x21], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[2], [x21]\n" + "b 115f\n" + "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[0], [x21]\n" + "115:" // Oddments: Load (5, 5): Bit 2: End + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v8.4s, v27.4h, v4.4h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "tbz x4, #2, 117f\n" + "ld1 { v6.4s }, [x2], #0x10\n" + "ld1 { v19.4s }, [x5], #0x10\n" + "tbz x4, #1, 116f\n" + "ld1 { v20.d }[0], [x2], #0x8\n" + "ld1 { v12.d }[0], [x5], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v20.s }[2], [x2]\n" + "ld1 { v12.s }[2], [x5]\n" + "b 119f\n" + "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v20.s }[0], [x2]\n" + "ld1 { v12.s }[0], [x5]\n" + "b 119f\n" + "117:" // Oddments: Load requant params: Bit 2: Unset + "tbz x4, #1, 118f\n" + "ld1 { v6.d }[0], [x2], #0x8\n" + "ld1 { v19.d }[0], [x5], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v6.s }[2], [x2]\n" + "ld1 { v19.s }[2], [x5]\n" + "b 119f\n" + "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v6.s }[0], [x2]\n" + "ld1 { v19.s }[0], [x5]\n" + "119:" // Oddments: Load requant params: Bit 2: End + "sqrdmulh v15.4s, v15.4s, v6.4s\n" + "add x17, x17, x1\n" + "sqrdmulh v18.4s, v18.4s, v20.4s\n" + "add x16, x16, x1\n" + "sqrdmulh v16.4s, v16.4s, v6.4s\n" + "add x6, x6, x1\n" + "sqrdmulh v21.4s, v21.4s, v20.4s\n" + "add x8, x8, x1\n" + "sqrdmulh v7.4s, v7.4s, v6.4s\n" + "and v28.16b, v15.16b, v19.16b\n" + "and v26.16b, v18.16b, v12.16b\n" + "and v29.16b, v16.16b, v19.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v28.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "sqadd v16.4s, v16.4s, v29.4s\n" + "and v4.16b, v21.16b, v12.16b\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "srshl v18.4s, v18.4s, v12.4s\n" + "srshl v16.4s, v16.4s, v19.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "add v15.4s, v15.4s, v10.4s\n" + "add v18.4s, v18.4s, v10.4s\n" + "add v16.4s, v16.4s, v10.4s\n" + "smin v15.4s, v15.4s, v13.4s\n" + "smin v18.4s, v18.4s, v13.4s\n" + "smin v16.4s, v16.4s, v13.4s\n" + "smax v15.4s, v15.4s, v11.4s\n" + "smax v18.4s, v18.4s, v11.4s\n" + "smax v16.4s, v16.4s, v11.4s\n" + "sqadd v21.4s, v21.4s, v4.4s\n" + "uzp1 v15.16b, v15.16b, v18.16b\n" + "and v25.16b, v7.16b, v19.16b\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "srshl v21.4s, v21.4s, v12.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sqrdmulh v17.4s, v17.4s, v20.4s\n" + "sqrdmulh v8.4s, v8.4s, v6.4s\n" + "add v21.4s, v21.4s, v10.4s\n" + "sqadd v7.4s, v7.4s, v25.4s\n" + "and v31.16b, v17.16b, v12.16b\n" + "smin v21.4s, v21.4s, v13.4s\n" + "and v24.16b, v8.16b, v19.16b\n" + "srshl v7.4s, v7.4s, v19.4s\n" + "smax v21.4s, v21.4s, v11.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sshr v24.4s, v24.4s, #0x1f\n" + "uzp1 v16.16b, v16.16b, v21.16b\n" + "add v7.4s, v7.4s, v10.4s\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "sqadd v17.4s, v17.4s, v31.4s\n" + "smin v7.4s, v7.4s, v13.4s\n" + "sqadd v8.4s, v8.4s, v24.4s\n" + "sqrdmulh v5.4s, v5.4s, v20.4s\n" + "smax v7.4s, v7.4s, v11.4s\n" + "srshl v17.4s, v17.4s, v12.4s\n" + "srshl v8.4s, v8.4s, 
v19.4s\n" + "and v1.16b, v5.16b, v12.16b\n" + "add v17.4s, v17.4s, v10.4s\n" + "add v8.4s, v8.4s, v10.4s\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "smin v17.4s, v17.4s, v13.4s\n" + "smin v8.4s, v8.4s, v13.4s\n" + "sqadd v5.4s, v5.4s, v1.4s\n" + "smax v17.4s, v17.4s, v11.4s\n" + "smax v8.4s, v8.4s, v11.4s\n" + "srshl v5.4s, v5.4s, v12.4s\n" + "uzp1 v7.16b, v7.16b, v17.16b\n" + "uzp1 v7.16b, v7.16b, v7.16b\n" + "add v5.4s, v5.4s, v10.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "smax v5.4s, v5.4s, v11.4s\n" + "uzp1 v8.16b, v8.16b, v5.16b\n" + "uzp1 v8.16b, v8.16b, v8.16b\n" + "tbz x4, #2, 121f\n" + "st1 { v15.s }[0], [x17], #0x4\n" + "st1 { v16.s }[0], [x16], #0x4\n" + "st1 { v7.s }[0], [x6], #0x4\n" + "st1 { v8.s }[0], [x8], #0x4\n" + "tbz x4, #1, 120f\n" + "st1 { v15.h }[2], [x17], #0x2\n" + "st1 { v16.h }[2], [x16], #0x2\n" + "st1 { v7.h }[2], [x6], #0x2\n" + "st1 { v8.h }[2], [x8], #0x2\n" + "tbz x4, #0, 123f\n" + "st1 { v15.b }[6], [x17], #0x1\n" + "st1 { v16.b }[6], [x16], #0x1\n" + "st1 { v7.b }[6], [x6], #0x1\n" + "st1 { v8.b }[6], [x8], #0x1\n" + "b 123f\n" + "120:" // Oddments: Bit 2: Bit 1: Unset + "tbz x4, #0, 123f\n" + "st1 { v15.b }[4], [x17], #0x1\n" + "st1 { v16.b }[4], [x16], #0x1\n" + "st1 { v7.b }[4], [x6], #0x1\n" + "st1 { v8.b }[4], [x8], #0x1\n" + "b 123f\n" + "121:" // Oddments: Bit 2: Unset + "tbz x4, #1, 122f\n" + "st1 { v15.h }[0], [x17], #0x2\n" + "st1 { v16.h }[0], [x16], #0x2\n" + "st1 { v7.h }[0], [x6], #0x2\n" + "st1 { v8.h }[0], [x8], #0x2\n" + "tbz x4, #0, 123f\n" + "st1 { v15.b }[2], [x17], #0x1\n" + "st1 { v16.b }[2], [x16], #0x1\n" + "st1 { v7.b }[2], [x6], #0x1\n" + "st1 { v8.b }[2], [x8], #0x1\n" + "b 123f\n" + "122:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 123f\n" + "st1 { v15.b }[0], [x17], #0x1\n" + "st1 { v16.b }[0], [x16], #0x1\n" + "st1 { v7.b }[0], [x6], #0x1\n" + "st1 { v8.b }[0], [x8], #0x1\n" + "123:" // Oddments: Bit 2: End + + "124:" // End + + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp new file mode 100644 index 0000000000..2bfeac0556 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); + +struct a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef int8_t weight_type; + typedef uint8_t return_type; + + typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int n_output_points = 9; + + kern_type kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl; + + a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..1633639ad5 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "arm_gemm.hpp" +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const void *params, + const arm_gemm::Requantize32& qp, + const unsigned int n_points, + const unsigned int n_channels +) +{ + __asm__ __volatile__( + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v12.4s }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n" + "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n" + "ld1r { v11.4s }, [x20]\n" + "ld1r { v10.16b }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v9.16b }, [x20]\n" + "ld1r { v8.4s }, [x19]\n" + "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n" + "ld1r { v7.4s }, [x20]\n" + "ld1r { v6.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n" + "mov x11, #0x0\n" + "ld1r { v5.4s }, [x19]\n" + "lsr x10, %x[n_channels], #0x2\n" + "cbz x10, 6f\n" + "1:" // Channel loop + "movi v27.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x11, #0x2\n" + "ldr q27, [%x[bias], x19]\n" + "2:" // Channel loop: Load bias: Done + "mov v26.16b, v27.16b\n" + "ldr s16, [%x[params]], #0x4\n" + "mov x20, %x[inptrs]\n" + "mov v25.16b, v27.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, %x[n_points], #0x1\n" + "mov v24.16b, v27.16b\n" + "ldr s4, [x9, x11]\n" + "mov v23.16b, v27.16b\n" + "mov v22.16b, v27.16b\n" + "ldr s3, [x28, x11]\n" + "mov v21.16b, v27.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v20.16b, v27.16b\n" + "ldr s2, [x27, x11]\n" + "mov v19.16b, v27.16b\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "ldr s1, [x26, x11]\n" + "usubl v4.8h, v4.8b, v10.8b\n" + "ldp x25, x24, [x20], #0x10\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "ldr s0, [x25, x11]\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "ldr s31, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "ldr s30, [x23, x11]\n" + "ldr s29, [x22, x11]\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "ldr x21, [x20], #0x8\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "ldr s28, [x21, x11]\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "ble 4f\n" + "3:" // Channel loop: Planar loop +
"smlal v27.4s, v4.4h, v16.4h\n" + "ldp x9, x28, [x20], #0x10\n" + "subs x19, x19, #0x1\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "ldr s4, [x9, x11]\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "ldr s3, [x28, x11]\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "ldp x27, x26, [x20], #0x10\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "ldr s2, [x27, x11]\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "ldr s16, [%x[params]], #0x4\n" + "usubl v4.8h, v4.8b, v10.8b\n" + "ldr s1, [x26, x11]\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "ldp x25, x24, [x20], #0x10\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "ldr s0, [x25, x11]\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "ldr s31, [x24, x11]\n" + "ldp x23, x22, [x20], #0x10\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "ldr s30, [x23, x11]\n" + "ldr s29, [x22, x11]\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "ldr x21, [x20], #0x8\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "ldr s28, [x21, x11]\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "bgt 3b\n" + "4:" // Channel loop: Planar tail + "smlal v27.4s, v4.4h, v16.4h\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "cbz %x[rq_mul_ptr], 5f\n" + "lsl x19, x11, #0x2\n" + "ldr q6, [%x[rq_mul_ptr], x19]\n" + "ldr q5, [%x[rq_right_shift_ptr], x19]\n" + "cbz %x[rq_left_shift_ptr], 5f\n" + "ldr q7, [%x[rq_left_shift_ptr], x19]\n" + "5:" // Channel loop: Load quantisation parameters: Done + "sshl v27.4s, v27.4s, v7.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "sshl v26.4s, v26.4s, v7.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "sshl v25.4s, v25.4s, v7.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "sqrdmulh v27.4s, v27.4s, v6.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "sqrdmulh v25.4s, v25.4s, v6.4s\n" + "sshl v24.4s, v24.4s, v7.4s\n" + "and v16.16b, v27.16b, v5.16b\n" + "and v18.16b, v26.16b, v5.16b\n" + "and v17.16b, v25.16b, v5.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sqadd v26.4s, v26.4s, v18.4s\n" + "sqadd v25.4s, v25.4s, v17.4s\n" + "sqrdmulh v24.4s, v24.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v5.4s\n" + "srshl v26.4s, v26.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v5.4s\n" + "and v16.16b, v24.16b, v5.16b\n" + "add v27.4s, v27.4s, v8.4s\n" + "add v26.4s, v26.4s, v8.4s\n" + "add v25.4s, v25.4s, v8.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v27.4s, v27.4s, v12.4s\n" + "smax v26.4s, v26.4s, v12.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "smin v27.4s, v27.4s, v11.4s\n" + "smin v26.4s, v26.4s, v11.4s\n" + "smax v25.4s, v25.4s, v12.4s\n" + "srshl v24.4s, v24.4s, v5.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "smin v25.4s, v25.4s, v11.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x27, x11]\n" + "add v24.4s, v24.4s, v8.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x26, x11]\n" + "smax v24.4s, v24.4s, v12.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x25, x11]\n" + "sshl v23.4s, v23.4s, v7.4s\n" + "sshl v22.4s, v22.4s, v7.4s\n" + "smin v24.4s, v24.4s, v11.4s\n" + "sqrdmulh v23.4s, v23.4s, v6.4s\n" 
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sshl v21.4s, v21.4s, v7.4s\n" + "and v17.16b, v23.16b, v5.16b\n" + "and v16.16b, v22.16b, v5.16b\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x24, x11]\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sqadd v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v21.16b, v5.16b\n" + "sshl v20.4s, v20.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v7.4s\n" + "srshl v23.4s, v23.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v5.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v6.4s\n" + "add v23.4s, v23.4s, v8.4s\n" + "add v22.4s, v22.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "and v17.16b, v20.16b, v5.16b\n" + "sqrdmulh v19.4s, v19.4s, v6.4s\n" + "smax v23.4s, v23.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v5.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v19.16b, v5.16b\n" + "smin v23.4s, v23.4s, v11.4s\n" + "add v21.4s, v21.4s, v8.4s\n" + "sqadd v20.4s, v20.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v22.4s, v22.4s, v12.4s\n" + "smax v21.4s, v21.4s, v12.4s\n" + "srshl v20.4s, v20.4s, v5.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "smin v21.4s, v21.4s, v11.4s\n" + "add v20.4s, v20.4s, v8.4s\n" + "srshl v19.4s, v19.4s, v5.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x23, x11]\n" + "add v19.4s, v19.4s, v8.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "smax v19.4s, v19.4s, v12.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x22, x11]\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x21, x11]\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x20, x11]\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x19, x11]\n" + "add x11, x11, #0x4\n" + "cmp x11, x10, LSL #2\n" + "blt 1b\n" + "6:" // Oddments + "tst %x[n_channels], #0x3\n" + "beq 24f\n" + "movi v27.4s, #0x0\n" + "cbz %x[bias], 9f\n" + "add x19, %x[bias], x11, LSL #2\n" + "tbz %x[n_channels], #1, 7f\n" + "ld1 { v27.d }[0], [x19], #0x8\n" + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v27.s }[2], [x19], #0x4\n" + "b 8f\n" + "7:" // Oddments: Load bias: Bit 1: Unset + "tbz %x[n_channels], #0, 8f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "8:" // Oddments: Load bias: Bit 1: End + + "9:" // Oddments: Load bias: Done + "mov v26.16b, v27.16b\n" + "ldr s16, [%x[params]], #0x4\n" + "mov x20, %x[inptrs]\n" + "mov v25.16b, v27.16b\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "mov v24.16b, v27.16b\n" + "ldp x27, x26, [x20], #0x10\n" + "mov v23.16b, v27.16b\n" + "ldp x25, x24, [x20], #0x10\n" + "mov v22.16b, v27.16b\n" + "add x28, x28, x11\n" + "mov v21.16b, v27.16b\n" + "ldp x23, x22, [x20], #0x10\n" + "mov v20.16b, v27.16b\n" + "add x27, x27, x11\n" + "mov v19.16b, v27.16b\n" + "ldr x21, [x20], #0x8\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "add x26, x26, x11\n" + "add x25, x25, x11\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #1, 10f\n" + "ldr h4, [x9], #0x2\n" + "ldr h3, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, [x25], #0x2\n" + "ldr h31, [x24], #0x2\n" + "ldr h30, [x23], #0x2\n" + "ldr h29, [x22], #0x2\n" + "ldr h28, [x21], #0x2\n" + "tbz 
%x[n_channels], #0, 11f\n" + "ld1 { v4.b }[2], [x9], #0x1\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v2.b }[2], [x27], #0x1\n" + "ld1 { v1.b }[2], [x26], #0x1\n" + "ld1 { v0.b }[2], [x25], #0x1\n" + "ld1 { v31.b }[2], [x24], #0x1\n" + "ld1 { v30.b }[2], [x23], #0x1\n" + "ld1 { v29.b }[2], [x22], #0x1\n" + "ld1 { v28.b }[2], [x21], #0x1\n" + "b 11f\n" + "10:" // Oddments: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 11f\n" + "ldr b4, [x9], #0x1\n" + "ldr b3, [x28], #0x1\n" + "ldr b2, [x27], #0x1\n" + "ldr b1, [x26], #0x1\n" + "ldr b0, [x25], #0x1\n" + "ldr b31, [x24], #0x1\n" + "ldr b30, [x23], #0x1\n" + "ldr b29, [x22], #0x1\n" + "ldr b28, [x21], #0x1\n" + "11:" // Oddments: Load: Bit 1: End + "usubl v4.8h, v4.8b, v10.8b\n" + "subs x19, %x[n_points], #0x1\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "ble 15f\n" + "12:" // Oddments: Planar loop + "smlal v27.4s, v4.4h, v16.4h\n" + "ldp x9, x28, [x20], #0x10\n" + "add x9, x9, x11\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "ldp x27, x26, [x20], #0x10\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "ldp x25, x24, [x20], #0x10\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "add x28, x28, x11\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "ldp x23, x22, [x20], #0x10\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "add x27, x27, x11\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "ldr x21, [x20], #0x8\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "add x26, x26, x11\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "ldr s16, [%x[params]], #0x4\n" + "add x25, x25, x11\n" + "ssubl v16.8h, v16.8b, v9.8b\n" + "add x24, x24, x11\n" + "add x23, x23, x11\n" + "add x22, x22, x11\n" + "add x21, x21, x11\n" + "tbz %x[n_channels], #1, 13f\n" + "ldr h4, [x9], #0x2\n" + "ldr h3, [x28], #0x2\n" + "ldr h2, [x27], #0x2\n" + "ldr h1, [x26], #0x2\n" + "ldr h0, [x25], #0x2\n" + "ldr h31, [x24], #0x2\n" + "ldr h30, [x23], #0x2\n" + "ldr h29, [x22], #0x2\n" + "ldr h28, [x21], #0x2\n" + "tbz %x[n_channels], #0, 14f\n" + "ld1 { v4.b }[2], [x9], #0x1\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v2.b }[2], [x27], #0x1\n" + "ld1 { v1.b }[2], [x26], #0x1\n" + "ld1 { v0.b }[2], [x25], #0x1\n" + "ld1 { v31.b }[2], [x24], #0x1\n" + "ld1 { v30.b }[2], [x23], #0x1\n" + "ld1 { v29.b }[2], [x22], #0x1\n" + "ld1 { v28.b }[2], [x21], #0x1\n" + "b 14f\n" + "13:" // Oddments: Planar loop: Load: Bit 1: Unset + "tbz %x[n_channels], #0, 14f\n" + "ldr b4, [x9], #0x1\n" + "ldr b3, [x28], #0x1\n" + "ldr b2, [x27], #0x1\n" + "ldr b1, [x26], #0x1\n" + "ldr b0, [x25], #0x1\n" + "ldr b31, [x24], #0x1\n" + "ldr b30, [x23], #0x1\n" + "ldr b29, [x22], #0x1\n" + "ldr b28, [x21], #0x1\n" + "14:" // Oddments: Planar loop: Load: Bit 1: End + "usubl v4.8h, v4.8b, v10.8b\n" + "subs x19, x19, #0x1\n" + "usubl v3.8h, v3.8b, v10.8b\n" + "usubl v2.8h, v2.8b, v10.8b\n" + "usubl v1.8h, v1.8b, v10.8b\n" + "usubl v0.8h, v0.8b, v10.8b\n" + "usubl v31.8h, v31.8b, v10.8b\n" + "usubl v30.8h, v30.8b, v10.8b\n" + "usubl v29.8h, v29.8b, v10.8b\n" + "usubl v28.8h, v28.8b, v10.8b\n" + "bgt 12b\n" + "15:" // Oddments: Planar tail + "smlal v27.4s, v4.4h, v16.4h\n" + "smlal v26.4s, v3.4h, v16.4h\n" + "smlal v25.4s, v2.4h, v16.4h\n" + "smlal v24.4s, v1.4h, v16.4h\n" + "smlal v23.4s, v0.4h, v16.4h\n" + "smlal v22.4s, v31.4h, v16.4h\n" + "smlal v21.4s, v30.4h, v16.4h\n" + "smlal v20.4s, v29.4h, v16.4h\n" + "smlal v19.4s, v28.4h, v16.4h\n" + "cbz 
%x[rq_mul_ptr], 21f\n" + "add x21, %x[rq_mul_ptr], x11, LSL #2\n" + "add x20, %x[rq_right_shift_ptr], x11, LSL #2\n" + "add x19, %x[rq_left_shift_ptr], x11, LSL #2\n" + "tbz %x[n_channels], #1, 18f\n" + "ld1 { v6.d }[0], [x21], #0x8\n" + "ld1 { v5.d }[0], [x20], #0x8\n" + "cbz %x[rq_left_shift_ptr], 16f\n" + "ld1 { v7.d }[0], [x19], #0x8\n" + "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v6.s }[2], [x21], #0x4\n" + "ld1 { v5.s }[2], [x20], #0x4\n" + "cbz %x[rq_left_shift_ptr], 17f\n" + "ld1 { v7.s }[2], [x19], #0x4\n" + "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done + "b 20f\n" + "18:" // Oddments: Load quantisation parameters: Bit 1: Unset + "tbz %x[n_channels], #0, 20f\n" + "ld1 { v6.s }[0], [x21], #0x4\n" + "ld1 { v5.s }[0], [x20], #0x4\n" + "cbz %x[rq_left_shift_ptr], 19f\n" + "ld1 { v7.s }[0], [x19], #0x4\n" + "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done + + "20:" // Oddments: Load quantisation parameters: Bit 1: End + + "21:" // Oddments: Load quantisation parameters: Done + "sshl v27.4s, v27.4s, v7.4s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "add x27, x27, x11\n" + "sqrdmulh v27.4s, v27.4s, v6.4s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "sshl v26.4s, v26.4s, v7.4s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "add x26, x26, x11\n" + "sshl v25.4s, v25.4s, v7.4s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "sshl v24.4s, v24.4s, v7.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x25, x25, x11\n" + "and v16.16b, v27.16b, v5.16b\n" + "add x24, x24, x11\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "add x23, x23, x11\n" + "sqrdmulh v25.4s, v25.4s, v6.4s\n" + "add x22, x22, x11\n" + "sqrdmulh v24.4s, v24.4s, v6.4s\n" + "add x21, x21, x11\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add x20, x20, x11\n" + "and v18.16b, v26.16b, v5.16b\n" + "add x19, x19, x11\n" + "and v17.16b, v25.16b, v5.16b\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v24.16b, v5.16b\n" + "srshl v27.4s, v27.4s, v5.4s\n" + "sqadd v26.4s, v26.4s, v18.4s\n" + "sqadd v25.4s, v25.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v27.4s, v27.4s, v8.4s\n" + "srshl v26.4s, v26.4s, v5.4s\n" + "srshl v25.4s, v25.4s, v5.4s\n" + "sqadd v24.4s, v24.4s, v16.4s\n" + "smax v27.4s, v27.4s, v12.4s\n" + "add v26.4s, v26.4s, v8.4s\n" + "add v25.4s, v25.4s, v8.4s\n" + "srshl v24.4s, v24.4s, v5.4s\n" + "smin v27.4s, v27.4s, v11.4s\n" + "smax v26.4s, v26.4s, v12.4s\n" + "smax v25.4s, v25.4s, v12.4s\n" + "add v24.4s, v24.4s, v8.4s\n" + "smin v26.4s, v26.4s, v11.4s\n" + "smin v25.4s, v25.4s, v11.4s\n" + "smax v24.4s, v24.4s, v12.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v24.4s, v24.4s, v11.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sshl v23.4s, v23.4s, v7.4s\n" + "sshl v22.4s, v22.4s, v7.4s\n" + "sqrdmulh v23.4s, v23.4s, v6.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sshl v21.4s, v21.4s, v7.4s\n" + "sshl v20.4s, v20.4s, v7.4s\n" + "and v17.16b, v23.16b, v5.16b\n" + "and v16.16b, v22.16b, v5.16b\n" + "sqrdmulh v21.4s, v21.4s, v6.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v20.4s, v20.4s, v6.4s\n" + "sqadd v23.4s, v23.4s, v17.4s\n" + "sqadd 
v22.4s, v22.4s, v16.4s\n" + "and v16.16b, v21.16b, v5.16b\n" + "and v17.16b, v20.16b, v5.16b\n" + "srshl v23.4s, v23.4s, v5.4s\n" + "srshl v22.4s, v22.4s, v5.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "add v23.4s, v23.4s, v8.4s\n" + "add v22.4s, v22.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "sqadd v20.4s, v20.4s, v17.4s\n" + "smax v23.4s, v23.4s, v12.4s\n" + "smax v22.4s, v22.4s, v12.4s\n" + "srshl v21.4s, v21.4s, v5.4s\n" + "srshl v20.4s, v20.4s, v5.4s\n" + "smin v23.4s, v23.4s, v11.4s\n" + "smin v22.4s, v22.4s, v11.4s\n" + "add v21.4s, v21.4s, v8.4s\n" + "add v20.4s, v20.4s, v8.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smax v21.4s, v21.4s, v12.4s\n" + "smax v20.4s, v20.4s, v12.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v21.4s, v21.4s, v11.4s\n" + "smin v20.4s, v20.4s, v11.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "sshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "sqrdmulh v19.4s, v19.4s, v6.4s\n" + "and v16.16b, v19.16b, v5.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "srshl v19.4s, v19.4s, v5.4s\n" + "add v19.4s, v19.4s, v8.4s\n" + "smax v19.4s, v19.4s, v12.4s\n" + "smin v19.4s, v19.4s, v11.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "tbz %x[n_channels], #1, 22f\n" + "st1 { v27.h }[0], [x27], #0x2\n" + "st1 { v26.h }[0], [x26], #0x2\n" + "st1 { v25.h }[0], [x25], #0x2\n" + "st1 { v24.h }[0], [x24], #0x2\n" + "st1 { v23.h }[0], [x23], #0x2\n" + "st1 { v22.h }[0], [x22], #0x2\n" + "st1 { v21.h }[0], [x21], #0x2\n" + "st1 { v20.h }[0], [x20], #0x2\n" + "st1 { v19.h }[0], [x19], #0x2\n" + "tbz %x[n_channels], #0, 23f\n" + "st1 { v27.b }[2], [x27], #0x1\n" + "st1 { v26.b }[2], [x26], #0x1\n" + "st1 { v25.b }[2], [x25], #0x1\n" + "st1 { v24.b }[2], [x24], #0x1\n" + "st1 { v23.b }[2], [x23], #0x1\n" + "st1 { v22.b }[2], [x22], #0x1\n" + "st1 { v21.b }[2], [x21], #0x1\n" + "st1 { v20.b }[2], [x20], #0x1\n" + "st1 { v19.b }[2], [x19], #0x1\n" + "b 23f\n" + "22:" // Oddments: Store: Bit 1: Unset + "tbz %x[n_channels], #0, 23f\n" + "st1 { v27.b }[0], [x27], #0x1\n" + "st1 { v26.b }[0], [x26], #0x1\n" + "st1 { v25.b }[0], [x25], #0x1\n" + "st1 { v24.b }[0], [x24], #0x1\n" + "st1 { v23.b }[0], [x23], #0x1\n" + "st1 { v22.b }[0], [x22], #0x1\n" + "st1 { v21.b }[0], [x21], #0x1\n" + "st1 { v20.b }[0], [x20], #0x1\n" + "st1 { v19.b }[0], [x19], #0x1\n" + "23:" // Oddments: Store: Bit 1: End + + "24:" // End + + : [params] "+&r" (params) + : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), 
[rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp new file mode 100644 index 0000000000..802030573e --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); + +struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef int8_t weight_type; + typedef uint8_t return_type; + + typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + + constexpr static unsigned int output_rows(void) { return 2; }; + constexpr static unsigned int output_cols(void) { return 8; }; + + constexpr static unsigned int output_col_regs(void) { return 2; }; + + kern_type kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + + a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..152999dd1a --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp @@ -0,0 +1,1484 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "arm_gemm.hpp" +#include <cstddef> +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const int8_t *weights, + const int32_t *bias, + const unsigned int kernel_points, + const unsigned int n_output_channels, + const int32_t *per_channel_left_shifts, + const int32_t *per_channel_muls, + const int32_t *per_channel_right_shifts, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "mov x9, #0x0\n" + "add x19, %x[qp], %[offsetof_Requantize32_minval]\n" + "ld1r { v14.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n" + "ld1r { v13.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_a_offset]\n" + "ld1r { v12.16b }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n" + "ld1r { v11.16b }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n" + "ld1r { v10.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n" + "ld1r { v9.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n" + "ld1r { v8.4s }, [x19]\n" + "add x19, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n" + "ld1r { v7.4s }, [x19]\n" + "lsr x28, %x[n_output_channels], #0x2\n" + "cbz x28, 9f\n" + "1:" // Output channel loop + "movi v16.4s, #0x0\n" + "cbz %x[bias], 2f\n" + "lsl x19, x9, #0x2\n" + "ldr q16, [%x[bias], x19]\n" + "2:" // Output channel loop: Load bias: Done + "mov v6.16b, v16.16b\n" + "mov v5.16b, v16.16b\n" + "mov v4.16b, v16.16b\n" + "mov v31.16b, v16.16b\n" + "mov v30.16b, v16.16b\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "cbz %x[rq_mul_ptr], 3f\n" + "lsl x19, x9, #0x2\n" + "ldr q8, [%x[rq_mul_ptr], x19]\n" + "ldr q7, [%x[rq_right_shift_ptr], x19]\n" + "cbz %x[rq_left_shift_ptr], 3f\n" + "ldr q9, [%x[rq_left_shift_ptr], x19]\n" + "3:" // Output channel loop: Load quantization parameters: Done + "ldr s17, [%x[weights]], #0x4\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "mov x19, %x[inptrs]\n" + "ldp x25, x27, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "ldr d3, [x25, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr d2, [x27, #0x0]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "cbz x20, 7f\n" + "ldp x25, x27, [x19], #0x10\n" + "ldr s16, [%x[weights]], #0x4\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "ldr d1, [x25, #0x0]\n" + "subs x20, x20, #0x1\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr d0, [x27, #0x0]\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "beq 5f\n" + "4:" // Output channel loop: Kernel loop + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl
v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "ldr d1, [x25, #0x0]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "ldr d0, [x27, #0x0]\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr s16, [%x[weights]], #0x4\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "bgt 4b\n" + "5:" // Output channel loop: Kernel loop tail + "tbnz %x[kernel_points], #0, 6f\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, 
v5.16b, v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 
v25.16b, v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "b 8f\n" + "6:" // Output channel loop: Odd tail + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, 
v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr 
v16.4s, v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" 
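+ // Same requantise-and-store pattern: v9/v8/v7 hold the left shift,
+ // multiplier and right shift (overridden with per-channel values at
+ // label 3 when rq_mul_ptr is set), v10 is the output offset and
+ // v13/v14 the upper/lower clamps. The paired uzp1 ops keep the low
+ // byte of each clamped 32-bit lane, so every str writes four
+ // consecutive output channels.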
+ "smin v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "b 8f\n" + "7:" // Output channel loop: Single kernel point + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "and v17.16b, v4.16b, v7.16b\n" + "and v16.16b, v31.16b, v7.16b\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str s6, [x19, x9]\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "add v4.4s, v4.4s, v10.4s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "str s5, [x20, x9]\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "smin v4.4s, v4.4s, v13.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "smin v31.4s, v31.4s, v13.4s\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "and v16.16b, v30.16b, v7.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "str s4, [x21, x9]\n" + "smax v31.4s, v31.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "str s31, [x22, x9]\n" + "and v17.16b, v29.16b, v7.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v28.16b, v7.16b\n" + "add 
v30.4s, v30.4s, v10.4s\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v29.4s, v29.4s, v10.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "str s30, [x23, x9]\n" + "smin v29.4s, v29.4s, v13.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "add v28.4s, v28.4s, v10.4s\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "str s29, [x24, x9]\n" + "smax v28.4s, v28.4s, v14.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "str s28, [x25, x9]\n" + "smin v27.4s, v27.4s, v13.4s\n" + "and v17.16b, v26.16b, v7.16b\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v25.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "str s27, [x26, x9]\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "and v17.16b, v24.16b, v7.16b\n" + "add v26.4s, v26.4s, v10.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v23.16b, v7.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "smin v25.4s, v25.4s, v13.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "str s26, [x19, x9]\n" + "smax v25.4s, v25.4s, v14.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "str s25, [x20, x9]\n" + "smin v24.4s, v24.4s, v13.4s\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "str s24, [x21, x9]\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v16.16b, v21.16b, v7.16b\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smax v23.4s, v23.4s, v14.4s\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str s23, [x22, x9]\n" + "add v22.4s, v22.4s, v10.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "smin v22.4s, v22.4s, v13.4s\n" + "sshr v16.4s, v16.4s, 
#0x1f\n" + "add v21.4s, v21.4s, v10.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str s22, [x23, x9]\n" + "smax v21.4s, v21.4s, v14.4s\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "str s21, [x24, x9]\n" + "smin v20.4s, v20.4s, v13.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "str s20, [x25, x9]\n" + "add v19.4s, v19.4s, v10.4s\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str s19, [x26, x9]\n" + "8:" // Output channel loop: Done + "add x9, x9, #0x4\n" + "cmp x9, x28, LSL #2\n" + "blt 1b\n" + "tst %x[n_output_channels], #0x3\n" + "beq 26f\n" + "9:" // Output channel oddments + "movi v16.4s, #0x0\n" + "cbz %x[bias], 12f\n" + "add x19, %x[bias], x9, LSL #2\n" + "tbz %x[n_output_channels], #1, 10f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #0, 11f\n" + "ld1 { v16.s }[2], [x19]\n" + "b 11f\n" + "10:" // Output channel oddments: Load bias: Bit 1: Unset + "tbz %x[n_output_channels], #0, 11f\n" + "ld1 { v16.s }[0], [x19]\n" + "11:" // Output channel oddments: Load bias: Bit 1: End + + "12:" // Output channel oddments: Load bias: Done + "mov v6.16b, v16.16b\n" + "mov v5.16b, v16.16b\n" + "mov v4.16b, v16.16b\n" + "mov v31.16b, v16.16b\n" + "mov v30.16b, v16.16b\n" + "mov v29.16b, v16.16b\n" + "mov v28.16b, v16.16b\n" + "mov v27.16b, v16.16b\n" + "mov v26.16b, v16.16b\n" + "mov v25.16b, v16.16b\n" + "mov v24.16b, v16.16b\n" + "mov v23.16b, v16.16b\n" + "mov v22.16b, v16.16b\n" + "mov v21.16b, v16.16b\n" + "mov v20.16b, v16.16b\n" + "mov v19.16b, v16.16b\n" + "cbz %x[rq_mul_ptr], 18f\n" + "add x21, %x[rq_mul_ptr], x9, LSL #2\n" + "add x20, %x[rq_right_shift_ptr], x9, LSL #2\n" + "add x19, %x[rq_left_shift_ptr], x9, LSL #2\n" + "cbz %x[rq_left_shift_ptr], 15f\n" + "tbz %x[n_output_channels], #1, 13f\n" + "ld1 { v8.d }[0], [x21], #0x8\n" + "ld1 { v7.d }[0], [x20], #0x8\n" + "ld1 { v9.d }[0], [x19], #0x8\n" + "tbz %x[n_output_channels], #0, 14f\n" + "ld1 { v8.s }[2], [x21], #0x4\n" + "ld1 { v7.s }[2], [x20], #0x4\n" + "ld1 { v9.s }[2], [x19], #0x4\n" + "b 14f\n" + "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset + "tbz %x[n_output_channels], #0, 14f\n" + "ld1 { v8.s }[0], [x21], #0x4\n" + "ld1 { v7.s }[0], [x20], #0x4\n" + "ld1 { v9.s }[0], [x19], #0x4\n" + "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End + "b 18f\n" + "15:" // Output channel oddments: Load quantization parameters: No left shift + "tbz %x[n_output_channels], #1, 16f\n" + "ld1 { v8.d }[0], [x21], #0x8\n" + "ld1 { v7.d }[0], [x20], #0x8\n" + "tbz %x[n_output_channels], #0, 17f\n" + "ld1 { v8.s }[2], [x21], #0x4\n" + "ld1 { v7.s }[2], [x20], #0x4\n" + "b 17f\n" + "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset + "tbz %x[n_output_channels], #0, 17f\n" + "ld1 { v8.s }[0], [x21], #0x4\n" + "ld1 { v7.s }[0], [x20], #0x4\n" + "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: 
End + + "18:" // Output channel oddments: Load quantization parameters: Done + "ldr s17, [%x[weights]], #0x4\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "mov x19, %x[inptrs]\n" + "ldp x25, x27, [x19], #0x10\n" + "lsr x20, %x[kernel_points], #0x1\n" + "ldr d3, [x25, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr d2, [x27, #0x0]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "cbz x20, 22f\n" + "ldp x25, x27, [x19], #0x10\n" + "ldr s16, [%x[weights]], #0x4\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "ldr d1, [x25, #0x0]\n" + "subs x20, x20, #0x1\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr d0, [x27, #0x0]\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "beq 20f\n" + "19:" // Output channel oddments: Kernel loop + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "subs x20, x20, #0x1\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "ldr d1, [x25, #0x0]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "ldr d0, [x27, #0x0]\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr s16, [%x[weights]], #0x4\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "ssubl v16.8h, v16.8b, v11.8b\n" + "bgt 19b\n" + "20:" // Output channel oddments: Kernel loop tail + "tbnz %x[kernel_points], #0, 21f\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, 
v0.h[7]\n" + "b 23f\n" + "21:" // Output channel oddments: Odd tail + "smlal v6.4s, v17.4h, v3.h[0]\n" + "ldp x25, x27, [x19], #0x10\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "ldr d3, [x25, #0x0]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "ldr d2, [x27, #0x0]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr s17, [%x[weights]], #0x4\n" + "smlal v6.4s, v16.4h, v1.h[0]\n" + "smlal v5.4s, v16.4h, v1.h[1]\n" + "smlal v4.4s, v16.4h, v1.h[2]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "ssubl v17.8h, v17.8b, v11.8b\n" + "smlal v31.4s, v16.4h, v1.h[3]\n" + "smlal v30.4s, v16.4h, v1.h[4]\n" + "smlal v29.4s, v16.4h, v1.h[5]\n" + "smlal v28.4s, v16.4h, v1.h[6]\n" + "smlal v27.4s, v16.4h, v1.h[7]\n" + "smlal v26.4s, v16.4h, v0.h[0]\n" + "smlal v25.4s, v16.4h, v0.h[1]\n" + "smlal v24.4s, v16.4h, v0.h[2]\n" + "smlal v23.4s, v16.4h, v0.h[3]\n" + "smlal v22.4s, v16.4h, v0.h[4]\n" + "smlal v21.4s, v16.4h, v0.h[5]\n" + "smlal v20.4s, v16.4h, v0.h[6]\n" + "smlal v19.4s, v16.4h, v0.h[7]\n" + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "b 23f\n" + "22:" // Output channel oddments: Single kernel point + "smlal v6.4s, v17.4h, v3.h[0]\n" + "smlal v5.4s, v17.4h, v3.h[1]\n" + "smlal v4.4s, v17.4h, v3.h[2]\n" + "smlal v31.4s, v17.4h, v3.h[3]\n" + "smlal v30.4s, v17.4h, v3.h[4]\n" + "smlal v29.4s, v17.4h, v3.h[5]\n" + "smlal v28.4s, v17.4h, v3.h[6]\n" + "smlal v27.4s, v17.4h, v3.h[7]\n" + "smlal v26.4s, v17.4h, v2.h[0]\n" + "smlal v25.4s, v17.4h, v2.h[1]\n" + "smlal v24.4s, v17.4h, v2.h[2]\n" + "smlal v23.4s, v17.4h, v2.h[3]\n" + "smlal v22.4s, v17.4h, v2.h[4]\n" + "smlal v21.4s, v17.4h, v2.h[5]\n" + "smlal v20.4s, v17.4h, v2.h[6]\n" + "smlal v19.4s, v17.4h, v2.h[7]\n" + "23:" // Output channel oddments: Done + "sshl v6.4s, v6.4s, v9.4s\n" + "sshl v5.4s, v5.4s, v9.4s\n" + "sshl v4.4s, v4.4s, v9.4s\n" + "sqrdmulh v6.4s, v6.4s, v8.4s\n" + "sqrdmulh v5.4s, v5.4s, v8.4s\n" + "sqrdmulh v4.4s, v4.4s, v8.4s\n" + "sshl v31.4s, v31.4s, v9.4s\n" + "and v18.16b, v6.16b, v7.16b\n" + "and v16.16b, v5.16b, v7.16b\n" + "and v17.16b, v4.16b, v7.16b\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v18.4s\n" + "sqadd v5.4s, v5.4s, v16.4s\n" + "sqadd v4.4s, v4.4s, v17.4s\n" + "sqrdmulh v31.4s, v31.4s, v8.4s\n" + "srshl v6.4s, v6.4s, v7.4s\n" + "srshl v5.4s, v5.4s, v7.4s\n" + "srshl v4.4s, v4.4s, v7.4s\n" + "and v16.16b, v31.16b, v7.16b\n" + "add v6.4s, v6.4s, v10.4s\n" + "add v5.4s, v5.4s, v10.4s\n" + "add v4.4s, v4.4s, v10.4s\n" + "smin v6.4s, v6.4s, v13.4s\n" + "smin v5.4s, v5.4s, v13.4s\n" + "smin 
v4.4s, v4.4s, v13.4s\n" + "smax v6.4s, v6.4s, v14.4s\n" + "smax v5.4s, v5.4s, v14.4s\n" + "smax v4.4s, v4.4s, v14.4s\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "uzp1 v5.16b, v5.16b, v5.16b\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v4.16b, v4.16b, v4.16b\n" + "sshl v30.4s, v30.4s, v9.4s\n" + "sqadd v31.4s, v31.4s, v16.4s\n" + "sqrdmulh v30.4s, v30.4s, v8.4s\n" + "sshl v29.4s, v29.4s, v9.4s\n" + "sshl v28.4s, v28.4s, v9.4s\n" + "srshl v31.4s, v31.4s, v7.4s\n" + "and v16.16b, v30.16b, v7.16b\n" + "sqrdmulh v29.4s, v29.4s, v8.4s\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "add v31.4s, v31.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "and v17.16b, v29.16b, v7.16b\n" + "smin v31.4s, v31.4s, v13.4s\n" + "sqadd v30.4s, v30.4s, v16.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "smax v31.4s, v31.4s, v14.4s\n" + "and v16.16b, v28.16b, v7.16b\n" + "srshl v30.4s, v30.4s, v7.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "sqadd v29.4s, v29.4s, v17.4s\n" + "uzp1 v31.16b, v31.16b, v31.16b\n" + "add v30.4s, v30.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "srshl v29.4s, v29.4s, v7.4s\n" + "smin v30.4s, v30.4s, v13.4s\n" + "sqadd v28.4s, v28.4s, v16.4s\n" + "sshl v27.4s, v27.4s, v9.4s\n" + "smax v30.4s, v30.4s, v14.4s\n" + "add v29.4s, v29.4s, v10.4s\n" + "srshl v28.4s, v28.4s, v7.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "smin v29.4s, v29.4s, v13.4s\n" + "uzp1 v30.16b, v30.16b, v30.16b\n" + "add v28.4s, v28.4s, v10.4s\n" + "smax v29.4s, v29.4s, v14.4s\n" + "sqrdmulh v27.4s, v27.4s, v8.4s\n" + "smin v28.4s, v28.4s, v13.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "sshl v26.4s, v26.4s, v9.4s\n" + "uzp1 v29.16b, v29.16b, v29.16b\n" + "smax v28.4s, v28.4s, v14.4s\n" + "and v16.16b, v27.16b, v7.16b\n" + "sqrdmulh v26.4s, v26.4s, v8.4s\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v28.16b, v28.16b, v28.16b\n" + "and v17.16b, v26.16b, v7.16b\n" + "sqadd v27.4s, v27.4s, v16.4s\n" + "sshl v25.4s, v25.4s, v9.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sqrdmulh v25.4s, v25.4s, v8.4s\n" + "srshl v27.4s, v27.4s, v7.4s\n" + "sqadd v26.4s, v26.4s, v17.4s\n" + "sshl v24.4s, v24.4s, v9.4s\n" + "and v16.16b, v25.16b, v7.16b\n" + "add v27.4s, v27.4s, v10.4s\n" + "srshl v26.4s, v26.4s, v7.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v27.4s, v27.4s, v13.4s\n" + "sqrdmulh v24.4s, v24.4s, v8.4s\n" + "add v26.4s, v26.4s, v10.4s\n" + "smax v27.4s, v27.4s, v14.4s\n" + "sqadd v25.4s, v25.4s, v16.4s\n" + "smin v26.4s, v26.4s, v13.4s\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "and v17.16b, v24.16b, v7.16b\n" + "uzp1 v27.16b, v27.16b, v27.16b\n" + "smax v26.4s, v26.4s, v14.4s\n" + "srshl v25.4s, v25.4s, v7.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "sshl v23.4s, v23.4s, v9.4s\n" + "uzp1 v26.16b, v26.16b, v26.16b\n" + "add v25.4s, v25.4s, v10.4s\n" + "sqadd v24.4s, v24.4s, v17.4s\n" + "sqrdmulh v23.4s, v23.4s, v8.4s\n" + "smin v25.4s, v25.4s, v13.4s\n" + "sshl v22.4s, v22.4s, v9.4s\n" + "srshl v24.4s, v24.4s, v7.4s\n" + "smax v25.4s, v25.4s, v14.4s\n" + "and v16.16b, v23.16b, v7.16b\n" + "sqrdmulh v22.4s, v22.4s, v8.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "add v24.4s, v24.4s, v10.4s\n" + "uzp1 v25.16b, v25.16b, v25.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "smin v24.4s, v24.4s, v13.4s\n" + "and v17.16b, v22.16b, v7.16b\n" + "sqadd v23.4s, v23.4s, v16.4s\n" + "smax v24.4s, v24.4s, v14.4s\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "sshl v21.4s, v21.4s, v9.4s\n" + "uzp1 
v24.16b, v24.16b, v24.16b\n" + "srshl v23.4s, v23.4s, v7.4s\n" + "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqadd v22.4s, v22.4s, v17.4s\n" + "sqrdmulh v21.4s, v21.4s, v8.4s\n" + "add v23.4s, v23.4s, v10.4s\n" + "sshl v20.4s, v20.4s, v9.4s\n" + "srshl v22.4s, v22.4s, v7.4s\n" + "smin v23.4s, v23.4s, v13.4s\n" + "and v16.16b, v21.16b, v7.16b\n" + "sqrdmulh v20.4s, v20.4s, v8.4s\n" + "smax v23.4s, v23.4s, v14.4s\n" + "add v22.4s, v22.4s, v10.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "smin v22.4s, v22.4s, v13.4s\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v21.4s, v21.4s, v16.4s\n" + "smax v22.4s, v22.4s, v14.4s\n" + "and v16.16b, v20.16b, v7.16b\n" + "sshl v19.4s, v19.4s, v9.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "srshl v21.4s, v21.4s, v7.4s\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "sqrdmulh v19.4s, v19.4s, v8.4s\n" + "add v21.4s, v21.4s, v10.4s\n" + "sqadd v20.4s, v20.4s, v16.4s\n" + "smin v21.4s, v21.4s, v13.4s\n" + "and v16.16b, v19.16b, v7.16b\n" + "srshl v20.4s, v20.4s, v7.4s\n" + "smax v21.4s, v21.4s, v14.4s\n" + "sshr v16.4s, v16.4s, #0x1f\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "add v20.4s, v20.4s, v10.4s\n" + "sqadd v19.4s, v19.4s, v16.4s\n" + "uzp1 v21.16b, v21.16b, v21.16b\n" + "smin v20.4s, v20.4s, v13.4s\n" + "srshl v19.4s, v19.4s, v7.4s\n" + "smax v20.4s, v20.4s, v14.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "add v19.4s, v19.4s, v10.4s\n" + "uzp1 v20.16b, v20.16b, v20.16b\n" + "smin v19.4s, v19.4s, v13.4s\n" + "smax v19.4s, v19.4s, v14.4s\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "tbz %x[n_output_channels], #1, 24f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.h }[0], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.h }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.h }[0], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.h }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.h }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.h }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.h }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.h }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.h }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.h }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.h }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.h }[0], [x25]\n" + "add x9, x9, #0x2\n" + "st1 { v19.h }[0], [x26]\n" + "tbz %x[n_output_channels], #0, 25f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.b }[2], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.b }[2], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.b }[2], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.b 
}[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.b }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.b }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.b }[2], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.b }[2], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.b }[2], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.b }[2], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.b }[2], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.b }[2], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.b }[2], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.b }[2], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v19.b }[2], [x26]\n" + "b 25f\n" + "24:" // Output channel oddments: Done: Store: Bit 1: Unset + "tbz %x[n_output_channels], #0, 25f\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "add x19, x19, x9\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "add x20, x20, x9\n" + "st1 { v6.b }[0], [x19]\n" + "add x21, x21, x9\n" + "st1 { v5.b }[0], [x20]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "add x22, x22, x9\n" + "st1 { v4.b }[0], [x21]\n" + "add x23, x23, x9\n" + "st1 { v31.b }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "add x24, x24, x9\n" + "st1 { v30.b }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "add x25, x25, x9\n" + "st1 { v29.b }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "add x26, x26, x9\n" + "st1 { v28.b }[0], [x25]\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "add x19, x19, x9\n" + "st1 { v27.b }[0], [x26]\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "add x20, x20, x9\n" + "st1 { v26.b }[0], [x19]\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "add x21, x21, x9\n" + "st1 { v25.b }[0], [x20]\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "add x22, x22, x9\n" + "st1 { v24.b }[0], [x21]\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "add x23, x23, x9\n" + "st1 { v23.b }[0], [x22]\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "add x24, x24, x9\n" + "st1 { v22.b }[0], [x23]\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "add x25, x25, x9\n" + "st1 { v21.b }[0], [x24]\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "add x26, x26, x9\n" + "st1 { v20.b }[0], [x25]\n" + "st1 { v19.b }[0], [x26]\n" + "25:" // Output channel oddments: Done: Store: Bit 1: End + + "26:" // Done + + : [weights] "+&r" (weights) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" 
(outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..c444472c68 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); +void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + +struct sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl; + + sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..b788c705e5 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const __fp16 *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + __fp16 *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const __fp16 min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x17, #0x0\n" + "mov x16, #0x0\n" + "1:" // Tile loop + "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x23, #0x2\n" + "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x15, #0x2\n" + "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n" + "mov x13, #0x0\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cnth x12\n" + "ldr x11, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "sub x21, XZR, x12\n" + "ldr x10, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x17, x22\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x16, x11, x19\n" // offset += tile_j * ld_input_col + "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x23\n" // offset *= kernel_stride * output_size + "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x10, x10, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x27, x10, x22, LSL #1\n" + "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x26, x27, x22, LSL #1\n" + "ld1h { z16.h }, p3/Z, [x14]\n" // Load from weights and bias + "mov z31.d, z16.d\n" + "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n" // Load from weights and bias + "add x25, x26, x22, LSL #1\n" + "mov z30.d, z16.d\n" + "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n" // Load from weights and bias + "add x24, x11, x11\n" + "mov z29.d, z16.d\n" + "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n" // Load from weights and bias + "add x23, x24, x11\n" + "mov z28.d, 
z16.d\n" + "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n" // Load from weights and bias + "mul x19, x17, x20\n" // offset = tile_i * ld_output_row + "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n" // Load from weights and bias + "madd x19, x16, x9, x19\n" // offset += tile_j * ld_output_col + "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n" // Load from weights and bias + "mul x19, x19, x15\n" // offset *= output_tile_size + "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n" // Load from weights and bias + "add x28, x28, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "whilelt p2.h, XZR, %x[n_channels]\n" + "ld1h { z9.h }, p2/Z, [x27, x11, LSL #1]\n" // Load input point (1, 1) + "ld1h { z10.h }, p2/Z, [x10]\n" // Load input point (0, 0) + "add x22, x28, x20, LSL #1\n" + "ld1h { z11.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (0, 3) + "addvl x14, x14, #16\n" + "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (1, 2) + "cmp x12, %x[n_channels]\n" + "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n" // Load from weights and bias + "addvl x14, x14, #-6\n" + "ld1h { z13.h }, p2/Z, [x26, x11, LSL #1]\n" // Load input point (2, 1) + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.h, p3/M, z4.h, z9.h\n" + "ld1h { z16.h }, p3/Z, [x14]\n" // Load from weights and bias + "whilelt p1.h, x12, %x[n_channels]\n" + "fmla z30.h, p3/M, z3.h, z9.h\n" + "inch x21\n" + "fmla z29.h, p3/M, z1.h, z9.h\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x25]\n" // Load input point (3, 0) + "inch x13\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (2, 2) + "inch x12\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n" // Load input point (3, 3) + "fmla z29.h, p3/M, z2.h, z12.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z31.h, p3/M, z5.h, z12.h\n" + "fmla z30.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n" // Load input point (0, 1) + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (0, 2) + "addvl x10, x10, #1\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "fmla z31.h, p3/M, z7.h, z13.h\n" + "fmla z30.h, p3/M, z6.h, z13.h\n" + "fmla z29.h, p3/M, z4.h, z13.h\n" + "fmla z28.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (1, 0) + "fmla z31.h, p3/M, z1.h, z12.h\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (1, 3) + "addvl x27, x27, #1\n" + "fmla z29.h, p3/M, z5.h, z10.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x14, #5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z9.h\n" + "fmla z30.h, p3/M, z1.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x26]\n" // Load input point (2, 0) + "ld1h { z1.h }, p3/Z, [x14, #2, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x14, #1, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z2.h, z12.h\n" + "ld1h { z2.h }, p3/Z, [x14, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z8.h, z10.h\n" + "fmla z30.h, p3/M, z7.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (2, 3) + "addvl x26, x26, #1\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "ld1h { z13.h }, p1/Z, [x26, x11, LSL #1]\n" // Load input point (2, 1) + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x25, x11, LSL 
#1]\n" // Load input point (3, 1) + "fmla z28.h, p3/M, z5.h, z10.h\n" + "ld1h { z3.h }, p3/Z, [x14, #4, MUL VL]\n" // Load from weights and bias + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n" // Load input point (3, 2) + "whilelt p2.h, x13, %x[n_channels]\n" + "fmla z29.h, p3/M, z7.h, z11.h\n" + "ld1h { z5.h }, p3/Z, [x14, #6, MUL VL]\n" // Load from weights and bias + "addvl x25, x25, #1\n" + "fmla z31.h, p3/M, z6.h, z9.h\n" + "ld1h { z9.h }, p1/Z, [x27, x11, LSL #1]\n" // Load input point (1, 1) + "cmp x12, %x[n_channels]\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "ld1h { z10.h }, p1/Z, [x10]\n" // Load input point (0, 0) + "fmla z28.h, p3/M, z6.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x10, x23, LSL #1]\n" // Load input point (0, 3) + "ld1h { z6.h }, p3/Z, [x14, #7, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z8.h, z12.h\n" + "addvl x14, x14, #16\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "ld1h { z8.h }, p3/Z, [x14, #-7, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z7.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x27, x24, LSL #1]\n" // Load input point (1, 2) + "fmax z30.h, p3/M, z30.h, z18.h\n" + "ld1h { z7.h }, p3/Z, [x14, #-8, MUL VL]\n" // Load from weights and bias + "addvl x14, x14, #-6\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "st1h { z31.h }, p0, [x28]\n" // Store output point (0, 0) + "mov z31.d, z16.d\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "st1h { z30.h }, p0, [x28, x9, LSL #1]\n" // Store output point (0, 1) + "mov z30.d, z16.d\n" + "addvl x28, x28, #1\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0) + "mov z29.d, z16.d\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z28.h }, p0, [x22, x9, LSL #1]\n" // Store output point (1, 1) + "mov z28.d, z16.d\n" + "addvl x22, x22, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.h, p3/M, z4.h, z9.h\n" + "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov p0.b, p2.b\n" + "fmla z30.h, p3/M, z3.h, z9.h\n" + "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x21, x17, #0x1\n" + "fmla z29.h, p3/M, z1.h, z9.h\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x25]\n" // Load input point (3, 0) + "add x16, x16, #0x1\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (2, 2) + "fmla z30.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x25, x23, LSL #1]\n" // Load input point (3, 3) + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "fmla z29.h, p3/M, z2.h, z12.h\n" + "cmp x16, x19\n" + "fmla z31.h, p3/M, z5.h, z12.h\n" + "fmla z30.h, p3/M, z4.h, z12.h\n" + "csel x16, x16, XZR, LT\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x10, x11, LSL #1]\n" // Load input point (0, 1) + "csel x17, x17, x21, LT\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (0, 2) + "cmp x17, x20\n" + "fmla z31.h, p3/M, z7.h, z13.h\n" + "fmla z30.h, p3/M, z6.h, z13.h\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "fmla z29.h, p3/M, z4.h, z13.h\n" + "fmla z31.h, p3/M, z1.h, z12.h\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (1, 3) + "fmla z28.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (1, 0) + "fmla z29.h, p3/M, z5.h, z10.h\n" + "fmla z31.h, 
p3/M, z2.h, z9.h\n" + "fmla z30.h, p3/M, z1.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x26]\n" // Load input point (2, 0) + "fmla z28.h, p3/M, z4.h, z10.h\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "fmla z31.h, p3/M, z8.h, z10.h\n" + "fmla z30.h, p3/M, z7.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (2, 3) + "fmla z28.h, p3/M, z2.h, z12.h\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x25, x11, LSL #1]\n" // Load input point (3, 1) + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x25, x24, LSL #1]\n" // Load input point (3, 2) + "fmla z28.h, p3/M, z5.h, z10.h\n" + "fmla z29.h, p3/M, z7.h, z11.h\n" + "fmla z31.h, p3/M, z6.h, z9.h\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "fmla z28.h, p3/M, z6.h, z11.h\n" + "fmla z29.h, p3/M, z8.h, z12.h\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "st1h { z31.h }, p0, [x28]\n" // Store output point (0, 0) + "fmin z30.h, p3/M, z30.h, z17.h\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z30.h }, p0, [x28, x9, LSL #1]\n" // Store output point (0, 1) + "fmax z28.h, p3/M, z28.h, z18.h\n" + "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0) + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z28.h }, p0, [x22, x9, LSL #1]\n" // Store output point (1, 1) + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..d8f905b33a --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + __fp16 *const *outptrs; + const void *params; + const __fp16 min, max; + const __fp16 *inptrs[16]; + + Args( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *const params, + const __fp16 min, + const __fp16 max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[0]; + inptrs[1] = input_ptrs[1]; + inptrs[2] = input_ptrs[2]; + inptrs[3] = input_ptrs[3]; + inptrs[4] = input_ptrs[4]; + inptrs[5] = input_ptrs[5]; + inptrs[6] = input_ptrs[6]; + inptrs[7] = input_ptrs[7]; + inptrs[8] = input_ptrs[8]; + inptrs[9] = input_ptrs[9]; + inptrs[10] = input_ptrs[10]; + inptrs[11] = input_ptrs[11]; + inptrs[12] = input_ptrs[12]; + inptrs[13] = input_ptrs[13]; + inptrs[14] = input_ptrs[14]; + inptrs[15] = input_ptrs[15]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x3, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n" + "add x19, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "mov x5, #0x0\n" + "ldp x6, x7, [x19, #0x0]\n" + "cnth x8\n" + "ldp x17, x16, [x19, #0x10]\n" + "sub x15, XZR, x8\n" + "ldp x14, x13, [x19, #0x20]\n" + "whilelt p2.h, XZR, %x[n_channels]\n" + "ldp x12, x11, [x19, #0x30]\n" + "cmp x8, %x[n_channels]\n" + "ldp x10, x9, [x19, #0x40]\n" + "ldp x28, x27, [x19, #0x50]\n" + "ldp x26, x25, [x19, #0x60]\n" + "ldp x24, x23, [x19, #0x70]\n" + "ldp x22, x21, [x3, #0x0]\n" + "ldp x20, x19, [x3, #0x10]\n" + "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "ld1h { z16.h }, p3/Z, [x4]\n" // Load from weights and bias + "mov z31.d, z16.d\n" + "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias + "mov z30.d, z16.d\n" + "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias + "mov z29.d, z16.d\n" + "ld1h { z2.h }, p3/Z, [x4, #3, 
MUL VL]\n" // Load from weights and bias + "mov z28.d, z16.d\n" + "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias + "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias + "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias + "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias + "addvl x4, x4, #16\n" + "ld1h { z9.h }, p2/Z, [x13, x5, LSL #1]\n" + "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias + "addvl x4, x4, #-6\n" + "ld1h { z10.h }, p2/Z, [x6, x5, LSL #1]\n" + "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n" + "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n" + "ld1h { z13.h }, p2/Z, [x9, x5, LSL #1]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.h, p3/M, z4.h, z9.h\n" + "ld1h { z16.h }, p3/Z, [x4]\n" // Load from weights and bias + "whilelt p1.h, x8, %x[n_channels]\n" + "fmla z30.h, p3/M, z3.h, z9.h\n" + "inch x15\n" + "fmla z29.h, p3/M, z1.h, z9.h\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x26, x5, LSL #1]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x23, x5, LSL #1]\n" + "fmla z29.h, p3/M, z2.h, z12.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z31.h, p3/M, z5.h, z12.h\n" + "fmla z30.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x7, x5, LSL #1]\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x17, x5, LSL #1]\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "fmla z31.h, p3/M, z7.h, z13.h\n" + "fmla z30.h, p3/M, z6.h, z13.h\n" + "fmla z29.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p1/Z, [x9, x8, LSL #1]\n" + "fmla z28.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x14, x5, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z12.h\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n" + "fmla z29.h, p3/M, z5.h, z10.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z9.h\n" + "fmla z30.h, p3/M, z1.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x10, x5, LSL #1]\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z2.h, z12.h\n" + "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias + "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z8.h, z10.h\n" + "fmla z30.h, p3/M, z7.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x27, x5, LSL #1]\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x25, x5, LSL #1]\n" + "fmla z28.h, p3/M, z5.h, z10.h\n" + "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n" + "inch x5\n" + "fmla z29.h, p3/M, z7.h, z11.h\n" + "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias + "whilelt p2.h, x5, %x[n_channels]\n" + "fmla z31.h, p3/M, z6.h, z9.h\n" + "ld1h { z9.h }, p1/Z, [x13, x8, LSL #1]\n" + "fmla z28.h, p3/M, z6.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x16, x8, LSL #1]\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "ld1h { z10.h }, p1/Z, [x6, x8, LSL #1]\n" + "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z8.h, z12.h\n" + "addvl x4, x4, #16\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x12, 
x8, LSL #1]\n" + "inch x8\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias + "cmp x8, %x[n_channels]\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias + "addvl x4, x4, #-6\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "st1h { z31.h }, p0, [x22, x15, LSL #1]\n" + "mov z31.d, z16.d\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "st1h { z30.h }, p0, [x21, x15, LSL #1]\n" + "mov z30.d, z16.d\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z29.h }, p0, [x20, x15, LSL #1]\n" + "mov z29.d, z16.d\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z28.h }, p0, [x19, x15, LSL #1]\n" + "mov z28.d, z16.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.h, p3/M, z4.h, z9.h\n" + "inch x15\n" + "fmla z30.h, p3/M, z3.h, z9.h\n" + "mov p0.b, p2.b\n" + "fmla z29.h, p3/M, z1.h, z9.h\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x26, x5, LSL #1]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x23, x5, LSL #1]\n" + "fmla z29.h, p3/M, z2.h, z12.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z31.h, p3/M, z5.h, z12.h\n" + "fmla z30.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x7, x5, LSL #1]\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x17, x5, LSL #1]\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "fmla z31.h, p3/M, z7.h, z13.h\n" + "fmla z30.h, p3/M, z6.h, z13.h\n" + "fmla z29.h, p3/M, z4.h, z13.h\n" + "fmla z28.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x14, x5, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z12.h\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n" + "fmla z29.h, p3/M, z5.h, z10.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "fmla z31.h, p3/M, z2.h, z9.h\n" + "fmla z30.h, p3/M, z1.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x10, x5, LSL #1]\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "fmla z28.h, p3/M, z2.h, z12.h\n" + "fmla z31.h, p3/M, z8.h, z10.h\n" + "fmla z30.h, p3/M, z7.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x27, x5, LSL #1]\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x25, x5, LSL #1]\n" + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n" + "fmla z28.h, p3/M, z5.h, z10.h\n" + "fmla z29.h, p3/M, z7.h, z11.h\n" + "fmla z31.h, p3/M, z6.h, z9.h\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "fmla z28.h, p3/M, z6.h, z11.h\n" + "fmla z29.h, p3/M, z8.h, z12.h\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "st1h { z31.h }, p0, [x22, x15, LSL #1]\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z30.h }, p0, [x21, x15, LSL #1]\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "st1h { z29.h }, p0, [x20, x15, LSL #1]\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z28.h }, p0, [x19, x15, LSL #1]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x3", "x4", "x5", "x6", "x7", 
"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp new file mode 100644 index 0000000000..f5d31e63f8 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); +void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + +struct sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 3; + constexpr static unsigned int output_cols = 3; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl; + + sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..aebf0bf7ac --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const __fp16 *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + __fp16 *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const __fp16 min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x6, #0x0\n" + "mov x7, #0x0\n" + "1:" // Tile loop + "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x24, #0x3\n" + "str x7, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x23, #0x3\n" + "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n" + "mov x17, #0x0\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cnth x16\n" + "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "sub x21, XZR, x16\n" + "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x6, x22\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x7, x15, x19\n" // offset += tile_j * ld_input_col + "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x24\n" // offset *= kernel_stride * output_size + "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x14, x14, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x11, x14, x22, LSL #1\n" + "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x10, x11, x22, LSL #1\n" + "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias + "mov z31.d, z16.d\n" + "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias + "add x9, x10, x22, LSL #1\n" + "mov z30.d, z16.d\n" + "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias + "add x28, x9, x22, LSL #1\n" + "mov z29.d, z16.d\n" + "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias + "add x27, x15, x15\n" + "mov z28.d, z16.d\n" 
+ "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias + "add x26, x27, x15\n" + "mov z27.d, z16.d\n" + "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias + "add x25, x26, x15\n" + "mov z26.d, z16.d\n" + "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias + "mul x19, x6, x20\n" // offset = tile_i * ld_output_row + "mov z25.d, z16.d\n" + "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias + "madd x19, x7, x13, x19\n" // offset += tile_j * ld_output_col + "mov z24.d, z16.d\n" + "mul x19, x19, x23\n" // offset *= output_tile_size + "mov z23.d, z16.d\n" + "add x12, x12, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "add x24, x13, x13\n" + "add x23, x12, x20, LSL #1\n" + "add x22, x23, x20, LSL #1\n" + "whilelt p2.h, XZR, %x[n_channels]\n" + "ld1h { z9.h }, p2/Z, [x10, x27, LSL #1]\n" // Load input point (2, 2) + "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (0, 0) + "addvl x8, x8, #16\n" + "ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n" // Load input point (0, 4) + "cmp x16, %x[n_channels]\n" + "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias + "addvl x8, x8, #-6\n" + "ld1h { z12.h }, p2/Z, [x28]\n" // Load input point (4, 0) + "ld1h { z13.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (1, 2) + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias + "whilelt p1.h, x16, %x[n_channels]\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "inch x21\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z5.h, z9.h\n" + "inch x17\n" + "fmla z27.h, p3/M, z4.h, z9.h\n" + "inch x16\n" + "fmla z26.h, p3/M, z3.h, z9.h\n" + "fmla z25.h, p3/M, z2.h, z9.h\n" + "fmla z24.h, p3/M, z1.h, z9.h\n" + "fmla z23.h, p3/M, z0.h, z9.h\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 3) + "fmla z29.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1) + "fmla z25.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 4) + "fmla z30.h, p3/M, z4.h, z13.h\n" + "fmla z31.h, p3/M, z5.h, z13.h\n" + "fmla z29.h, p3/M, z3.h, z13.h\n" + "fmla z28.h, p3/M, z2.h, z13.h\n" + "fmla z27.h, p3/M, z1.h, z13.h\n" + "fmla z26.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n" // Load input point (0, 1) + "fmla z23.h, p3/M, z8.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n" // Load input point (0, 3) + "fmla z31.h, p3/M, z7.h, z11.h\n" + "fmla z30.h, p3/M, z6.h, z11.h\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "fmla z27.h, p3/M, z3.h, z11.h\n" + "fmla z25.h, p3/M, z1.h, z11.h\n" + "fmla z24.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11]\n" // Load input point (1, 0) + "fmla z31.h, p3/M, z1.h, z13.h\n" + "fmla z30.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x11, x25, LSL #1]\n" // Load input point (1, 4) + "fmla z29.h, p3/M, z1.h, z12.h\n" + "fmla z27.h, p3/M, z5.h, z10.h\n" + "fmla z26.h, p3/M, z4.h, z10.h\n" + "fmla z30.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9]\n" // Load input point (3, 0) + "fmla z29.h, p3/M, z7.h, z10.h\n" + "fmla z24.h, p3/M, z2.h, z10.h\n" + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n" // Load input point (3, 2) + "fmla z31.h, p3/M, z3.h, z11.h\n" + 
"fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 4) + "fmla z29.h, p3/M, z5.h, z13.h\n" + "fmla z26.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1) + "fmla z25.h, p3/M, z3.h, z12.h\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n" // Load input point (1, 1) + "fmla z27.h, p3/M, z7.h, z10.h\n" + "fmla z26.h, p3/M, z6.h, z10.h\n" + "fmla z25.h, p3/M, z5.h, z10.h\n" + "fmla z28.h, p3/M, z8.h, z10.h\n" + "fmla z24.h, p3/M, z4.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z26.h, p3/M, z8.h, z11.h\n" + "fmla z25.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z6.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 3) + "fmla z23.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n" // Load input point (1, 3) + "addvl x11, x11, #1\n" + "fmla z31.h, p3/M, z4.h, z12.h\n" + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z27.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1) + "fmla z29.h, p3/M, z4.h, z11.h\n" + "fmla z30.h, p3/M, z5.h, z11.h\n" + "fmla z26.h, p3/M, z1.h, z11.h\n" + "fmla z27.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (0, 2) + "addvl x14, x14, #1\n" + "fmla z24.h, p3/M, z8.h, z13.h\n" + "ld1h { z10.h }, p1/Z, [x14]\n" // Load input point (0, 0) + "fmla z23.h, p3/M, z7.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 3) + "addvl x9, x9, #1\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z27.h, p3/M, z6.h, z12.h\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x10]\n" // Load input point (2, 0) + "fmla z31.h, p3/M, z2.h, z11.h\n" + "fmla z30.h, p3/M, z1.h, z11.h\n" + "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 4) + "addvl x10, x10, #1\n" + "fmla z27.h, p3/M, z8.h, z13.h\n" + "ld1h { z9.h }, p1/Z, [x10, x27, LSL #1]\n" // Load input point (2, 2) + "fmla z26.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z5.h, z13.h\n" + "fmla z23.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n" // Load input point (4, 2) + "whilelt p2.h, x17, %x[n_channels]\n" + "fmla z31.h, p3/M, z6.h, z12.h\n" + "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias + "addvl x28, x28, #1\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias + "cmp x16, %x[n_channels]\n" + "fmla z25.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x28]\n" // Load input point (4, 0) + "fmla z29.h, p3/M, z8.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias + "fmla z26.h, p3/M, z5.h, z11.h\n" + "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias + "fmla z23.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n" // Load input point (0, 4) + "fmla z24.h, p3/M, z7.h, z13.h\n" + "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias + "fmla z25.h, p3/M, z8.h, z13.h\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "fmla z23.h, p3/M, z6.h, z13.h\n" + "ld1h { z13.h }, p1/Z, [x11, x27, LSL #1]\n" // Load input point (1, 2) + "fmax z30.h, p3/M, z30.h, z18.h\n" + "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias + "addvl 
x8, x8, #16\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias + "fmax z29.h, p3/M, z29.h, z18.h\n" + "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias + "addvl x8, x8, #-6\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "st1h { z31.h }, p0, [x12]\n" // Store output point (0, 0) + "mov z31.d, z16.d\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z30.h }, p0, [x12, x13, LSL #1]\n" // Store output point (0, 1) + "mov z30.d, z16.d\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "st1h { z29.h }, p0, [x12, x24, LSL #1]\n" // Store output point (0, 2) + "mov z29.d, z16.d\n" + "addvl x12, x12, #1\n" + "fmax z27.h, p3/M, z27.h, z18.h\n" + "fmax z26.h, p3/M, z26.h, z18.h\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z28.h }, p0, [x23]\n" // Store output point (1, 0) + "mov z28.d, z16.d\n" + "fmin z27.h, p3/M, z27.h, z17.h\n" + "st1h { z27.h }, p0, [x23, x13, LSL #1]\n" // Store output point (1, 1) + "mov z27.d, z16.d\n" + "fmin z26.h, p3/M, z26.h, z17.h\n" + "st1h { z26.h }, p0, [x23, x24, LSL #1]\n" // Store output point (1, 2) + "mov z26.d, z16.d\n" + "addvl x23, x23, #1\n" + "fmax z25.h, p3/M, z25.h, z18.h\n" + "fmax z24.h, p3/M, z24.h, z18.h\n" + "fmax z23.h, p3/M, z23.h, z18.h\n" + "fmin z25.h, p3/M, z25.h, z17.h\n" + "st1h { z25.h }, p0, [x22]\n" // Store output point (2, 0) + "mov z25.d, z16.d\n" + "fmin z24.h, p3/M, z24.h, z17.h\n" + "st1h { z24.h }, p0, [x22, x13, LSL #1]\n" // Store output point (2, 1) + "mov z24.d, z16.d\n" + "fmin z23.h, p3/M, z23.h, z17.h\n" + "st1h { z23.h }, p0, [x22, x24, LSL #1]\n" // Store output point (2, 2) + "mov z23.d, z16.d\n" + "addvl x22, x22, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov p0.b, p2.b\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "ldr x7, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x21, x6, #0x1\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z28.h, p3/M, z5.h, z9.h\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "add x7, x7, #0x1\n" + "fmla z27.h, p3/M, z4.h, z9.h\n" + "cmp x7, x19\n" + "fmla z26.h, p3/M, z3.h, z9.h\n" + "fmla z25.h, p3/M, z2.h, z9.h\n" + "csel x7, x7, XZR, LT\n" + "fmla z24.h, p3/M, z1.h, z9.h\n" + "csel x6, x6, x21, LT\n" + "fmla z23.h, p3/M, z0.h, z9.h\n" + "cmp x6, x20\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 3) + "fmla z29.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1) + "fmla z25.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 4) + "fmla z30.h, p3/M, z4.h, z13.h\n" + "fmla z31.h, p3/M, z5.h, z13.h\n" + "fmla z29.h, p3/M, z3.h, z13.h\n" + "fmla z28.h, p3/M, z2.h, z13.h\n" + "fmla z27.h, p3/M, z1.h, z13.h\n" + "fmla z26.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n" // Load input point (0, 1) + "fmla z23.h, p3/M, z8.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n" // Load input point (0, 3) + "fmla z31.h, p3/M, z7.h, z11.h\n" + "fmla z30.h, p3/M, z6.h, z11.h\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "fmla z27.h, p3/M, z3.h, z11.h\n" + "fmla z25.h, p3/M, z1.h, z11.h\n" + "fmla z24.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11]\n" // Load input point (1, 0) + "fmla z31.h, p3/M, z1.h, z13.h\n" + "fmla z30.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h 
}, p2/Z, [x11, x25, LSL #1]\n" // Load input point (1, 4) + "fmla z29.h, p3/M, z1.h, z12.h\n" + "fmla z27.h, p3/M, z5.h, z10.h\n" + "fmla z26.h, p3/M, z4.h, z10.h\n" + "fmla z30.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9]\n" // Load input point (3, 0) + "fmla z29.h, p3/M, z7.h, z10.h\n" + "fmla z24.h, p3/M, z2.h, z10.h\n" + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x9, x27, LSL #1]\n" // Load input point (3, 2) + "fmla z31.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 4) + "fmla z29.h, p3/M, z5.h, z13.h\n" + "fmla z26.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1) + "fmla z25.h, p3/M, z3.h, z12.h\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n" // Load input point (1, 1) + "fmla z27.h, p3/M, z7.h, z10.h\n" + "fmla z26.h, p3/M, z6.h, z10.h\n" + "fmla z25.h, p3/M, z5.h, z10.h\n" + "fmla z28.h, p3/M, z8.h, z10.h\n" + "fmla z24.h, p3/M, z4.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z26.h, p3/M, z8.h, z11.h\n" + "fmla z25.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z6.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 3) + "fmla z23.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x26, LSL #1]\n" // Load input point (1, 3) + "fmla z31.h, p3/M, z4.h, z12.h\n" + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z27.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1) + "fmla z29.h, p3/M, z4.h, z11.h\n" + "fmla z30.h, p3/M, z5.h, z11.h\n" + "fmla z26.h, p3/M, z1.h, z11.h\n" + "fmla z27.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (0, 2) + "fmla z24.h, p3/M, z8.h, z13.h\n" + "fmla z23.h, p3/M, z7.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 3) + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z27.h, p3/M, z6.h, z12.h\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x10]\n" // Load input point (2, 0) + "fmla z31.h, p3/M, z2.h, z11.h\n" + "fmla z30.h, p3/M, z1.h, z11.h\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 4) + "fmla z27.h, p3/M, z8.h, z13.h\n" + "fmla z26.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z5.h, z13.h\n" + "fmla z23.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x27, LSL #1]\n" // Load input point (4, 2) + "fmla z31.h, p3/M, z6.h, z12.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "fmla z25.h, p3/M, z0.h, z12.h\n" + "fmla z29.h, p3/M, z8.h, z11.h\n" + "fmla z26.h, p3/M, z5.h, z11.h\n" + "fmla z23.h, p3/M, z2.h, z11.h\n" + "fmla z25.h, p3/M, z8.h, z13.h\n" + "fmla z24.h, p3/M, z7.h, z13.h\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "fmla z23.h, p3/M, z6.h, z13.h\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "st1h { z31.h }, p0, [x12]\n" // Store output point (0, 0) + "fmin z30.h, p3/M, z30.h, z17.h\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z30.h }, p0, [x12, x13, LSL #1]\n" // Store output point (0, 1) + "fmax z28.h, p3/M, z28.h, z18.h\n" + "fmax z27.h, p3/M, z27.h, z18.h\n" + "st1h { z29.h }, p0, [x12, x24, LSL #1]\n" // Store output point (0, 2) + "fmax z26.h, p3/M, z26.h, z18.h\n" + "fmax z25.h, p3/M, z25.h, z18.h\n" + "fmax z24.h, p3/M, z24.h, z18.h\n" + 
"fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z28.h }, p0, [x23]\n" // Store output point (1, 0) + "fmin z27.h, p3/M, z27.h, z17.h\n" + "fmin z26.h, p3/M, z26.h, z17.h\n" + "st1h { z27.h }, p0, [x23, x13, LSL #1]\n" // Store output point (1, 1) + "fmin z25.h, p3/M, z25.h, z17.h\n" + "fmin z24.h, p3/M, z24.h, z17.h\n" + "st1h { z26.h }, p0, [x23, x24, LSL #1]\n" // Store output point (1, 2) + "fmax z23.h, p3/M, z23.h, z18.h\n" + "st1h { z25.h }, p0, [x22]\n" // Store output point (2, 0) + "fmin z23.h, p3/M, z23.h, z17.h\n" + "st1h { z24.h }, p0, [x22, x13, LSL #1]\n" // Store output point (2, 1) + "st1h { z23.h }, p0, [x22, x24, LSL #1]\n" // Store output point (2, 2) + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..65ecb6d218 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + __fp16 *const *outptrs; + const void *params; + const __fp16 min, max; + const __fp16 *inptrs[25]; + + Args( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *const params, + const __fp16 min, + const __fp16 max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[12]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[4]; + inptrs[3] = input_ptrs[20]; + inptrs[4] = input_ptrs[7]; + inptrs[5] = input_ptrs[24]; + inptrs[6] = input_ptrs[11]; + inptrs[7] = input_ptrs[1]; + inptrs[8] = input_ptrs[3]; + inptrs[9] = input_ptrs[13]; + inptrs[10] = input_ptrs[5]; + inptrs[11] = input_ptrs[9]; + inptrs[12] = input_ptrs[15]; + inptrs[13] = input_ptrs[17]; + inptrs[14] = input_ptrs[19]; + inptrs[15] = input_ptrs[21]; + inptrs[16] = input_ptrs[6]; + inptrs[17] = input_ptrs[8]; + inptrs[18] = input_ptrs[23]; + inptrs[19] = input_ptrs[16]; + inptrs[20] = input_ptrs[2]; + inptrs[21] = input_ptrs[18]; + inptrs[22] = input_ptrs[10]; + inptrs[23] = input_ptrs[14]; + inptrs[24] = input_ptrs[22]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x6, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n" + "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "mov x17, #0x0\n" + "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "cnth x16\n" + "ld1h { z16.h }, p3/Z, [x7]\n" // Load from weights and bias + "mov z31.d, z16.d\n" + "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n" // Load from weights and bias + "sub x15, XZR, x16\n" + "mov z30.d, z16.d\n" + "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n" // Load from weights and bias + "whilelt p2.h, XZR, %x[n_channels]\n" + "mov z29.d, z16.d\n" + "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n" // Load from weights and bias + "cmp x16, %x[n_channels]\n" + "mov z28.d, z16.d\n" + "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n" // Load from weights and bias + "mov z27.d, z16.d\n" + "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n" // Load from weights and bias + "mov z26.d, z16.d\n" + "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n" // Load from weights and bias + "mov z25.d, z16.d\n" + "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n" // Load from weights and bias + "addvl x7, x7, #16\n" + "mov z24.d, z16.d\n" + "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n" // Load from weights and bias + "mov z23.d, z16.d\n" + "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n" // Load from weights and bias + "addvl x7, x7, #-6\n" + "ldp x14, x13, [x8, #0x0]\n" + "ldp x12, x11, [x8, #0x10]\n" + "ldr x10, [x8, #0x20]\n" + "ld1h { z9.h }, p2/Z, [x14, x17, LSL #1]\n" + "ld1h { z10.h }, p2/Z, [x13, x17, LSL #1]\n" + "ld1h { z11.h }, p2/Z, [x12, x17, LSL 
#1]\n" + "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n" + "ld1h { z13.h }, p2/Z, [x10, x17, LSL #1]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x9, [x8, #0x28]\n" + "whilelt p1.h, x16, %x[n_channels]\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "ldr x28, [x8, #0x30]\n" + "inch x15\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ldr x27, [x8, #0x38]\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z5.h, z9.h\n" + "ldr x26, [x8, #0x40]\n" + "fmla z27.h, p3/M, z4.h, z9.h\n" + "ldr x22, [x8, #0x48]\n" + "fmla z26.h, p3/M, z3.h, z9.h\n" + "ldr x21, [x8, #0x50]\n" + "fmla z25.h, p3/M, z2.h, z9.h\n" + "ldr x20, [x8, #0x58]\n" + "fmla z24.h, p3/M, z1.h, z9.h\n" + "ldr x19, [x8, #0x60]\n" + "fmla z23.h, p3/M, z0.h, z9.h\n" + "ldr x25, [x8, #0x68]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x22, x17, LSL #1]\n" + "fmla z29.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x28, x17, LSL #1]\n" + "fmla z25.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x17, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z13.h\n" + "ldr x24, [x8, #0x70]\n" + "fmla z31.h, p3/M, z5.h, z13.h\n" + "ldr x23, [x8, #0x78]\n" + "fmla z29.h, p3/M, z3.h, z13.h\n" + "ldr x14, [x8, #0x80]\n" + "fmla z28.h, p3/M, z2.h, z13.h\n" + "ldr x13, [x8, #0x88]\n" + "fmla z27.h, p3/M, z1.h, z13.h\n" + "ldr x12, [x8, #0x90]\n" + "fmla z26.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x27, x17, LSL #1]\n" + "fmla z23.h, p3/M, z8.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x26, x17, LSL #1]\n" + "fmla z31.h, p3/M, z7.h, z11.h\n" + "ldr x11, [x8, #0x98]\n" + "fmla z30.h, p3/M, z6.h, z11.h\n" + "ldr x10, [x8, #0xa0]\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "ldr x9, [x8, #0xa8]\n" + "fmla z27.h, p3/M, z3.h, z11.h\n" + "ldr x28, [x8, #0xb0]\n" + "fmla z25.h, p3/M, z1.h, z11.h\n" + "ldr x27, [x8, #0xb8]\n" + "fmla z24.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x21, x17, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z13.h\n" + "ldr x26, [x8, #0xc0]\n" + "fmla z30.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x20, x17, LSL #1]\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "ldr x22, [x6, #0x0]\n" + "fmla z27.h, p3/M, z5.h, z10.h\n" + "ldr x21, [x6, #0x8]\n" + "fmla z26.h, p3/M, z4.h, z10.h\n" + "ldr x20, [x6, #0x10]\n" + "fmla z30.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x19, x17, LSL #1]\n" + "fmla z29.h, p3/M, z7.h, z10.h\n" + "ldr x19, [x6, #0x18]\n" + "fmla z24.h, p3/M, z2.h, z10.h\n" + "ld1h { z16.h }, p3/Z, [x7]\n" // Load from weights and bias + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x25, x17, LSL #1]\n" + "fmla z31.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, x17, LSL #1]\n" + "fmla z29.h, p3/M, z5.h, z13.h\n" + "fmla z26.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x23, x17, LSL #1]\n" + "fmla z25.h, p3/M, z3.h, z12.h\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x17, LSL #1]\n" + "fmla z27.h, p3/M, z7.h, z10.h\n" + "fmla z26.h, p3/M, z6.h, z10.h\n" + "fmla z25.h, p3/M, z5.h, z10.h\n" + "fmla z28.h, p3/M, z8.h, z10.h\n" + "fmla z24.h, p3/M, z4.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z26.h, p3/M, z8.h, z11.h\n" + "fmla z25.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z6.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x12, x17, LSL #1]\n" + "fmla z23.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n" + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ldp x14, x13, [x8, #0x0]\n" + "fmla z30.h, p3/M, z3.h, z12.h\n" + "ld1h { z9.h }, p1/Z, [x14, x16, LSL #1]\n" + 
"fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z27.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n" + "fmla z30.h, p3/M, z5.h, z11.h\n" + "ld1h { z10.h }, p1/Z, [x13, x16, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z11.h\n" + "ldp x12, x11, [x8, #0x10]\n" + "fmla z26.h, p3/M, z1.h, z11.h\n" + "fmla z27.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x17, LSL #1]\n" + "fmla z24.h, p3/M, z8.h, z13.h\n" + "ldr x10, [x8, #0x20]\n" + "fmla z23.h, p3/M, z7.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x9, x17, LSL #1]\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z27.h, p3/M, z6.h, z12.h\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n" + "fmla z31.h, p3/M, z2.h, z11.h\n" + "fmla z30.h, p3/M, z1.h, z11.h\n" + "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x17, LSL #1]\n" + "fmla z27.h, p3/M, z8.h, z13.h\n" + "fmla z26.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z5.h, z13.h\n" + "fmla z23.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n" + "inch x17\n" + "fmla z31.h, p3/M, z6.h, z12.h\n" + "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n" // Load from weights and bias + "whilelt p2.h, x17, %x[n_channels]\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n" // Load from weights and bias + "fmla z25.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x11, x16, LSL #1]\n" + "fmla z29.h, p3/M, z8.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n" // Load from weights and bias + "fmla z26.h, p3/M, z5.h, z11.h\n" + "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n" // Load from weights and bias + "fmla z23.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x12, x16, LSL #1]\n" + "fmla z25.h, p3/M, z8.h, z13.h\n" + "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n" // Load from weights and bias + "fmla z24.h, p3/M, z7.h, z13.h\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "fmla z23.h, p3/M, z6.h, z13.h\n" + "ld1h { z13.h }, p1/Z, [x10, x16, LSL #1]\n" + "inch x16\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n" // Load from weights and bias + "addvl x7, x7, #16\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n" // Load from weights and bias + "cmp x16, %x[n_channels]\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n" // Load from weights and bias + "addvl x7, x7, #-6\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "st1h { z31.h }, p0, [x22, x15, LSL #1]\n" + "mov z31.d, z16.d\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "ldr x22, [x6, #0x20]\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z30.h }, p0, [x21, x15, LSL #1]\n" + "mov z30.d, z16.d\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z29.h }, p0, [x20, x15, LSL #1]\n" + "mov z29.d, z16.d\n" + "ldr x21, [x6, #0x28]\n" + "fmax z27.h, p3/M, z27.h, z18.h\n" + "ldr x20, [x6, #0x30]\n" + "fmax z26.h, p3/M, z26.h, z18.h\n" + "st1h { z28.h }, p0, [x19, x15, LSL #1]\n" + "mov z28.d, z16.d\n" + "ldr x19, [x6, #0x38]\n" + "fmax z25.h, p3/M, z25.h, z18.h\n" + "fmin z27.h, p3/M, z27.h, z17.h\n" + "st1h { z27.h }, p0, [x22, x15, LSL #1]\n" + "mov z27.d, z16.d\n" + "fmin z26.h, p3/M, z26.h, z17.h\n" + "ldr x22, [x6, #0x40]\n" + "fmin z25.h, p3/M, z25.h, z17.h\n" + "st1h { z26.h }, p0, [x21, x15, LSL #1]\n" + "mov z26.d, z16.d\n" + "fmax z24.h, p3/M, z24.h, z18.h\n" + "st1h { z25.h }, p0, [x20, x15, LSL #1]\n" + "mov z25.d, z16.d\n" + "fmax z23.h, p3/M, z23.h, z18.h\n" + 
"fmin z24.h, p3/M, z24.h, z17.h\n" + "st1h { z24.h }, p0, [x19, x15, LSL #1]\n" + "mov z24.d, z16.d\n" + "fmin z23.h, p3/M, z23.h, z17.h\n" + "st1h { z23.h }, p0, [x22, x15, LSL #1]\n" + "mov z23.d, z16.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x9, [x8, #0x28]\n" + "inch x15\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "ldr x28, [x8, #0x30]\n" + "mov p0.b, p2.b\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ldr x27, [x8, #0x38]\n" + "fmla z28.h, p3/M, z5.h, z9.h\n" + "ldr x26, [x8, #0x40]\n" + "fmla z27.h, p3/M, z4.h, z9.h\n" + "ldr x22, [x8, #0x48]\n" + "fmla z26.h, p3/M, z3.h, z9.h\n" + "ldr x21, [x8, #0x50]\n" + "fmla z25.h, p3/M, z2.h, z9.h\n" + "ldr x20, [x8, #0x58]\n" + "fmla z24.h, p3/M, z1.h, z9.h\n" + "ldr x19, [x8, #0x60]\n" + "fmla z23.h, p3/M, z0.h, z9.h\n" + "ldr x25, [x8, #0x68]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x22, x17, LSL #1]\n" + "fmla z29.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x28, x17, LSL #1]\n" + "fmla z25.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x17, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z13.h\n" + "ldr x24, [x8, #0x70]\n" + "fmla z31.h, p3/M, z5.h, z13.h\n" + "ldr x23, [x8, #0x78]\n" + "fmla z29.h, p3/M, z3.h, z13.h\n" + "ldr x14, [x8, #0x80]\n" + "fmla z28.h, p3/M, z2.h, z13.h\n" + "ldr x13, [x8, #0x88]\n" + "fmla z27.h, p3/M, z1.h, z13.h\n" + "ldr x12, [x8, #0x90]\n" + "fmla z26.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x27, x17, LSL #1]\n" + "fmla z23.h, p3/M, z8.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x26, x17, LSL #1]\n" + "fmla z31.h, p3/M, z7.h, z11.h\n" + "ldr x11, [x8, #0x98]\n" + "fmla z30.h, p3/M, z6.h, z11.h\n" + "ldr x10, [x8, #0xa0]\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "ldr x9, [x8, #0xa8]\n" + "fmla z27.h, p3/M, z3.h, z11.h\n" + "ldr x28, [x8, #0xb0]\n" + "fmla z25.h, p3/M, z1.h, z11.h\n" + "ldr x27, [x8, #0xb8]\n" + "fmla z24.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x21, x17, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z13.h\n" + "ldr x26, [x8, #0xc0]\n" + "fmla z30.h, p3/M, z0.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x20, x17, LSL #1]\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "ldr x22, [x6, #0x0]\n" + "fmla z27.h, p3/M, z5.h, z10.h\n" + "ldr x21, [x6, #0x8]\n" + "fmla z26.h, p3/M, z4.h, z10.h\n" + "ldr x20, [x6, #0x10]\n" + "fmla z30.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x19, x17, LSL #1]\n" + "fmla z29.h, p3/M, z7.h, z10.h\n" + "ldr x19, [x6, #0x18]\n" + "fmla z24.h, p3/M, z2.h, z10.h\n" + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z30.h, p3/M, z8.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x25, x17, LSL #1]\n" + "fmla z31.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, x17, LSL #1]\n" + "fmla z29.h, p3/M, z5.h, z13.h\n" + "fmla z26.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x23, x17, LSL #1]\n" + "fmla z25.h, p3/M, z3.h, z12.h\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x17, LSL #1]\n" + "fmla z27.h, p3/M, z7.h, z10.h\n" + "fmla z26.h, p3/M, z6.h, z10.h\n" + "fmla z25.h, p3/M, z5.h, z10.h\n" + "fmla z28.h, p3/M, z8.h, z10.h\n" + "fmla z24.h, p3/M, z4.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z26.h, p3/M, z8.h, z11.h\n" + "fmla z25.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z6.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x12, x17, LSL #1]\n" + "fmla z23.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x17, LSL #1]\n" + "fmla z31.h, p3/M, z4.h, z12.h\n" + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z27.h, p3/M, z0.h, z12.h\n" + 
"ld1h { z12.h }, p2/Z, [x11, x17, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z11.h\n" + "fmla z30.h, p3/M, z5.h, z11.h\n" + "fmla z26.h, p3/M, z1.h, z11.h\n" + "fmla z27.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x17, LSL #1]\n" + "fmla z24.h, p3/M, z8.h, z13.h\n" + "fmla z23.h, p3/M, z7.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x9, x17, LSL #1]\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z27.h, p3/M, z6.h, z12.h\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x28, x17, LSL #1]\n" + "fmla z31.h, p3/M, z2.h, z11.h\n" + "fmla z30.h, p3/M, z1.h, z11.h\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x17, LSL #1]\n" + "fmla z27.h, p3/M, z8.h, z13.h\n" + "fmla z26.h, p3/M, z7.h, z13.h\n" + "fmla z24.h, p3/M, z5.h, z13.h\n" + "fmla z23.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x26, x17, LSL #1]\n" + "fmla z31.h, p3/M, z6.h, z12.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "fmla z25.h, p3/M, z0.h, z12.h\n" + "fmla z29.h, p3/M, z8.h, z11.h\n" + "fmla z26.h, p3/M, z5.h, z11.h\n" + "fmla z23.h, p3/M, z2.h, z11.h\n" + "fmla z25.h, p3/M, z8.h, z13.h\n" + "fmla z24.h, p3/M, z7.h, z13.h\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "fmla z23.h, p3/M, z6.h, z13.h\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "st1h { z31.h }, p0, [x22, x15, LSL #1]\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "ldr x22, [x6, #0x20]\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "st1h { z30.h }, p0, [x21, x15, LSL #1]\n" + "fmax z27.h, p3/M, z27.h, z18.h\n" + "fmax z26.h, p3/M, z26.h, z18.h\n" + "st1h { z29.h }, p0, [x20, x15, LSL #1]\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "ldr x21, [x6, #0x28]\n" + "fmax z25.h, p3/M, z25.h, z18.h\n" + "ldr x20, [x6, #0x30]\n" + "fmax z24.h, p3/M, z24.h, z18.h\n" + "st1h { z28.h }, p0, [x19, x15, LSL #1]\n" + "fmin z27.h, p3/M, z27.h, z17.h\n" + "fmin z26.h, p3/M, z26.h, z17.h\n" + "ldr x19, [x6, #0x38]\n" + "fmin z25.h, p3/M, z25.h, z17.h\n" + "st1h { z27.h }, p0, [x22, x15, LSL #1]\n" + "fmin z24.h, p3/M, z24.h, z17.h\n" + "fmax z23.h, p3/M, z23.h, z18.h\n" + "st1h { z26.h }, p0, [x21, x15, LSL #1]\n" + "st1h { z25.h }, p0, [x20, x15, LSL #1]\n" + "fmin z23.h, p3/M, z23.h, z17.h\n" + "st1h { z24.h }, p0, [x19, x15, LSL #1]\n" + "ldr x22, [x6, #0x40]\n" + "st1h { z23.h }, p0, [x22, x15, LSL #1]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp new file mode 100644 index 0000000000..f976842b7a --- /dev/null +++ 
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); +void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + +struct sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 4; + constexpr static unsigned int output_cols = 4; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl; + + sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..8f0fce7e96 
--- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const __fp16 *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + __fp16 *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const __fp16 min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x2, #0x0\n" + "mov x3, #0x0\n" + "1:" // Tile loop + "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x24, #0x4\n" + "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x23, #0x4\n" + "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n" + "mov x5, #0x0\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cnth x6\n" + "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "sub x21, XZR, x6\n" + "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x2, x22\n" // offset = tile_i * ld_input_row + "ldr x20, 
[%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col + "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x24\n" // offset *= kernel_stride * output_size + "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x15, x8, x22, LSL #1\n" + "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x14, x15, x22, LSL #1\n" + "ld1h { z13.h }, p3/Z, [x4]\n" // Load from weights and bias + "mov z31.d, z13.d\n" + "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias + "add x13, x14, x22, LSL #1\n" + "mov z30.d, z13.d\n" + "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias + "add x12, x13, x22, LSL #1\n" + "mov z29.d, z13.d\n" + "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias + "add x11, x12, x22, LSL #1\n" + "mov z28.d, z13.d\n" + "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias + "add x10, x7, x7\n" + "mov z27.d, z13.d\n" + "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias + "add x9, x10, x7\n" + "mov z26.d, z13.d\n" + "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias + "add x28, x9, x7\n" + "mov z25.d, z13.d\n" + "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias + "add x27, x28, x7\n" + "mov z24.d, z13.d\n" + "mul x19, x2, x20\n" // offset = tile_i * ld_output_row + "mov z23.d, z13.d\n" + "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col + "mov z22.d, z13.d\n" + "mul x19, x19, x23\n" // offset *= output_tile_size + "mov z21.d, z13.d\n" + "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "mov z20.d, z13.d\n" + "add x26, x16, x20, LSL #1\n" + "mov z19.d, z13.d\n" + "add x25, x26, x20, LSL #1\n" + "mov z18.d, z13.d\n" + "add x24, x25, x20, LSL #1\n" + "mov z17.d, z13.d\n" + "add x23, x17, x17\n" + "mov z16.d, z13.d\n" + "add x22, x23, x17\n" + "whilelt p2.h, XZR, %x[n_channels]\n" + "ld1h { z9.h }, p2/Z, [x14, x10, LSL #1]\n" // Load input point (2, 2) + "ld1h { z10.h }, p2/Z, [x8]\n" // Load input point (0, 0) + "addvl x4, x4, #16\n" + "ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n" // Load input point (0, 5) + "cmp x6, %x[n_channels]\n" + "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias + "addvl x4, x4, #-6\n" + "ld1h { z12.h }, p2/Z, [x14, x9, LSL #1]\n" // Load input point (2, 3) + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ld1h { z13.h }, p3/Z, [x4]\n" // Load from weights and bias + "whilelt p1.h, x6, %x[n_channels]\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "inch x21\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "mov p0.b, p2.b\n" + "fmla z27.h, p3/M, z5.h, z9.h\n" + "inch x5\n" + "fmla z26.h, p3/M, z4.h, z9.h\n" + "inch x6\n" + "fmla z25.h, p3/M, z3.h, z9.h\n" + "fmla z23.h, p3/M, z2.h, z9.h\n" + "fmla z22.h, p3/M, z1.h, z9.h\n" + "fmla z21.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n" // Load input point (3, 2) + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x11]\n" // Load input point (5, 0) + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (5, 5) + "fmla z30.h, p3/M, z8.h, z12.h\n" + "fmla z29.h, p3/M, z7.h, z12.h\n" + "fmla z26.h, 
p3/M, z5.h, z12.h\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "fmla z22.h, p3/M, z2.h, z12.h\n" + "fmla z21.h, p3/M, z1.h, z12.h\n" + "fmla z20.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n" // Load input point (0, 1) + "fmla z19.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n" // Load input point (3, 3) + "fmla z16.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n" // Load input point (0, 4) + "fmla z27.h, p3/M, z8.h, z9.h\n" + "fmla z26.h, p3/M, z7.h, z9.h\n" + "fmla z25.h, p3/M, z6.h, z9.h\n" + "fmla z23.h, p3/M, z5.h, z9.h\n" + "fmla z22.h, p3/M, z4.h, z9.h\n" + "fmla z21.h, p3/M, z3.h, z9.h\n" + "fmla z19.h, p3/M, z2.h, z9.h\n" + "fmla z18.h, p3/M, z1.h, z9.h\n" + "fmla z17.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x15]\n" // Load input point (1, 0) + "fmla z31.h, p3/M, z1.h, z12.h\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n" // Load input point (1, 5) + "fmla z29.h, p3/M, z2.h, z11.h\n" + "fmla z28.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12]\n" // Load input point (4, 0) + "fmla z26.h, p3/M, z8.h, z10.h\n" + "fmla z25.h, p3/M, z7.h, z10.h\n" + "fmla z24.h, p3/M, z6.h, z10.h\n" + "fmla z22.h, p3/M, z5.h, z10.h\n" + "fmla z21.h, p3/M, z4.h, z10.h\n" + "fmla z20.h, p3/M, z3.h, z10.h\n" + "fmla z18.h, p3/M, z2.h, z10.h\n" + "fmla z17.h, p3/M, z1.h, z10.h\n" + "fmla z16.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n" // Load input point (1, 2) + "fmla z31.h, p3/M, z3.h, z9.h\n" + "fmla z27.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n" // Load input point (1, 3) + "fmla z23.h, p3/M, z6.h, z11.h\n" + "fmla z19.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n" // Load input point (4, 5) + "fmla z31.h, p3/M, z5.h, z10.h\n" + "fmla z30.h, p3/M, z4.h, z10.h\n" + "fmla z29.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z2.h, z10.h\n" + "fmla z26.h, p3/M, z1.h, z10.h\n" + "fmla z25.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n" // Load input point (2, 1) + "fmla z20.h, p3/M, z8.h, z11.h\n" + "fmla z16.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n" // Load input point (5, 1) + "fmla z30.h, p3/M, z5.h, z12.h\n" + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "fmla z26.h, p3/M, z2.h, z12.h\n" + "fmla z25.h, p3/M, z1.h, z12.h\n" + "fmla z24.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n" // Load input point (2, 4) + "fmla z19.h, p3/M, z7.h, z11.h\n" + "fmla z18.h, p3/M, z6.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n" // Load input point (5, 4) + "fmla z31.h, p3/M, z7.h, z10.h\n" + "fmla z30.h, p3/M, z6.h, z10.h\n" + "fmla z27.h, p3/M, z4.h, z10.h\n" + "fmla z26.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z22.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n" // Load input point (0, 2) + "fmla z17.h, p3/M, z8.h, z11.h\n" + "fmla z16.h, p3/M, z7.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n" // Load input point (3, 1) + "fmla z29.h, p3/M, z8.h, z12.h\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z25.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z4.h, z12.h\n" + "fmla z21.h, p3/M, z2.h, z12.h\n" + "fmla z20.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n" // Load input point (0, 3) + "addvl x8, x8, #1\n" 
+ "fmla z31.h, p3/M, z2.h, z10.h\n" + "fmla z30.h, p3/M, z1.h, z10.h\n" + "fmla z29.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (2, 0) + "fmla z27.h, p3/M, z7.h, z11.h\n" + "fmla z26.h, p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z4.h, z11.h\n" + "fmla z22.h, p3/M, z3.h, z11.h\n" + "fmla z19.h, p3/M, z1.h, z11.h\n" + "fmla z18.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n" // Load input point (3, 4) + "fmla z30.h, p3/M, z2.h, z12.h\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "fmla z28.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (2, 5) + "addvl x14, x14, #1\n" + "fmla z31.h, p3/M, z6.h, z10.h\n" + "ld1h { z9.h }, p1/Z, [x14, x10, LSL #1]\n" // Load input point (2, 2) + "fmla z27.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x13]\n" // Load input point (3, 0) + "fmla z25.h, p3/M, z8.h, z11.h\n" + "fmla z24.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, p3/M, z5.h, z11.h\n" + "fmla z20.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z2.h, z11.h\n" + "fmla z16.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n" // Load input point (4, 2) + "fmla z28.h, p3/M, z8.h, z12.h\n" + "fmla z24.h, p3/M, z5.h, z12.h\n" + "fmla z20.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n" // Load input point (3, 5) + "addvl x13, x13, #1\n" + "fmla z27.h, p3/M, z6.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z19.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n" // Load input point (5, 2) + "fmla z22.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z8.h, z11.h\n" + "fmla z19.h, p3/M, z5.h, z11.h\n" + "fmla z18.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n" // Load input point (4, 3) + "fmla z24.h, p3/M, z8.h, z12.h\n" + "fmla z20.h, p3/M, z5.h, z12.h\n" + "fmla z16.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n" // Load input point (5, 3) + "addvl x11, x11, #1\n" + "fmla z19.h, p3/M, z8.h, z10.h\n" + "fmla z18.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n" // Load input point (1, 1) + "fmla z22.h, p3/M, z8.h, z11.h\n" + "fmla z21.h, p3/M, z7.h, z11.h\n" + "fmla z20.h, p3/M, z6.h, z11.h\n" + "fmla z18.h, p3/M, z5.h, z11.h\n" + "fmla z17.h, p3/M, z4.h, z11.h\n" + "fmla z16.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x15, x28, LSL #1]\n" // Load input point (1, 4) + "addvl x15, x15, #1\n" + "fmla z18.h, p3/M, z8.h, z12.h\n" + "fmla z31.h, p3/M, z4.h, z10.h\n" + "fmla z17.h, p3/M, z7.h, z12.h\n" + "fmla z16.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n" // Load input point (4, 1) + "fmla z30.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z1.h, z10.h\n" + "fmla z26.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n" // Load input point (4, 4) + "whilelt p2.h, x5, %x[n_channels]\n" + "fmla z29.h, p3/M, z5.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x4, #1, MUL VL]\n" // Load from weights and bias + "addvl x12, x12, #1\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "cmp x6, %x[n_channels]\n" + "fmla z25.h, p3/M, z2.h, z11.h\n" + "ld1h { z2.h }, p3/Z, [x4, #3, MUL VL]\n" // Load from weights and bias + "fmla z24.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n" // Load input point (0, 5) + "fmla z23.h, p3/M, z7.h, z12.h\n" + "ld1h { z1.h }, p3/Z, [x4, #2, MUL VL]\n" // Load from weights and bias + "fmla 
z22.h, p3/M, z6.h, z12.h\n" + "ld1h { z6.h }, p3/Z, [x4, #7, MUL VL]\n" // Load from weights and bias + "fmla z19.h, p3/M, z4.h, z12.h\n" + "fmla z18.h, p3/M, z3.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x14, x9, LSL #1]\n" // Load input point (2, 3) + "fmla z21.h, p3/M, z8.h, z10.h\n" + "ld1h { z3.h }, p3/Z, [x4, #4, MUL VL]\n" // Load from weights and bias + "fmla z20.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z5.h, z10.h\n" + "ld1h { z5.h }, p3/Z, [x4, #6, MUL VL]\n" // Load from weights and bias + "fmla z16.h, p3/M, z4.h, z10.h\n" + "ld1h { z10.h }, p1/Z, [x8]\n" // Load input point (0, 0) + "fmax z31.h, p3/M, z31.h, z15.h\n" + "ld1h { z4.h }, p3/Z, [x4, #5, MUL VL]\n" // Load from weights and bias + "addvl x4, x4, #16\n" + "fmax z30.h, p3/M, z30.h, z15.h\n" + "ld1h { z7.h }, p3/Z, [x4, #-8, MUL VL]\n" // Load from weights and bias + "fmax z29.h, p3/M, z29.h, z15.h\n" + "ld1h { z8.h }, p3/Z, [x4, #-7, MUL VL]\n" // Load from weights and bias + "addvl x4, x4, #-6\n" + "fmin z31.h, p3/M, z31.h, z14.h\n" + "st1h { z31.h }, p0, [x16]\n" // Store output point (0, 0) + "mov z31.d, z13.d\n" + "fmin z30.h, p3/M, z30.h, z14.h\n" + "st1h { z30.h }, p0, [x16, x17, LSL #1]\n" // Store output point (0, 1) + "mov z30.d, z13.d\n" + "fmin z29.h, p3/M, z29.h, z14.h\n" + "st1h { z29.h }, p0, [x16, x23, LSL #1]\n" // Store output point (0, 2) + "mov z29.d, z13.d\n" + "fmax z28.h, p3/M, z28.h, z15.h\n" + "fmax z27.h, p3/M, z27.h, z15.h\n" + "fmax z26.h, p3/M, z26.h, z15.h\n" + "fmax z25.h, p3/M, z25.h, z15.h\n" + "fmin z28.h, p3/M, z28.h, z14.h\n" + "st1h { z28.h }, p0, [x16, x22, LSL #1]\n" // Store output point (0, 3) + "mov z28.d, z13.d\n" + "addvl x16, x16, #1\n" + "fmin z27.h, p3/M, z27.h, z14.h\n" + "st1h { z27.h }, p0, [x26]\n" // Store output point (1, 0) + "mov z27.d, z13.d\n" + "fmin z26.h, p3/M, z26.h, z14.h\n" + "st1h { z26.h }, p0, [x26, x17, LSL #1]\n" // Store output point (1, 1) + "mov z26.d, z13.d\n" + "fmin z25.h, p3/M, z25.h, z14.h\n" + "st1h { z25.h }, p0, [x26, x23, LSL #1]\n" // Store output point (1, 2) + "mov z25.d, z13.d\n" + "fmax z24.h, p3/M, z24.h, z15.h\n" + "fmax z23.h, p3/M, z23.h, z15.h\n" + "fmax z22.h, p3/M, z22.h, z15.h\n" + "fmax z21.h, p3/M, z21.h, z15.h\n" + "fmin z24.h, p3/M, z24.h, z14.h\n" + "st1h { z24.h }, p0, [x26, x22, LSL #1]\n" // Store output point (1, 3) + "mov z24.d, z13.d\n" + "addvl x26, x26, #1\n" + "fmin z23.h, p3/M, z23.h, z14.h\n" + "st1h { z23.h }, p0, [x25]\n" // Store output point (2, 0) + "mov z23.d, z13.d\n" + "fmin z22.h, p3/M, z22.h, z14.h\n" + "st1h { z22.h }, p0, [x25, x17, LSL #1]\n" // Store output point (2, 1) + "mov z22.d, z13.d\n" + "fmin z21.h, p3/M, z21.h, z14.h\n" + "st1h { z21.h }, p0, [x25, x23, LSL #1]\n" // Store output point (2, 2) + "mov z21.d, z13.d\n" + "fmax z20.h, p3/M, z20.h, z15.h\n" + "fmax z19.h, p3/M, z19.h, z15.h\n" + "fmax z18.h, p3/M, z18.h, z15.h\n" + "fmax z17.h, p3/M, z17.h, z15.h\n" + "fmin z20.h, p3/M, z20.h, z14.h\n" + "st1h { z20.h }, p0, [x25, x22, LSL #1]\n" // Store output point (2, 3) + "mov z20.d, z13.d\n" + "addvl x25, x25, #1\n" + "fmin z19.h, p3/M, z19.h, z14.h\n" + "st1h { z19.h }, p0, [x24]\n" // Store output point (3, 0) + "mov z19.d, z13.d\n" + "fmin z18.h, p3/M, z18.h, z14.h\n" + "st1h { z18.h }, p0, [x24, x17, LSL #1]\n" // Store output point (3, 1) + "mov z18.d, z13.d\n" + "fmin z17.h, p3/M, z17.h, z14.h\n" + "st1h { z17.h }, p0, [x24, x23, LSL #1]\n" // Store output point (3, 2) + "mov z17.d, z13.d\n" + "fmax z16.h, p3/M, z16.h, z15.h\n" + "fmin z16.h, p3/M, z16.h, z14.h\n" + "st1h { z16.h }, 
p0, [x24, x22, LSL #1]\n" // Store output point (3, 3) + "mov z16.d, z13.d\n" + "addvl x24, x24, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov p0.b, p2.b\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x21, x2, #0x1\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z27.h, p3/M, z5.h, z9.h\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "add x3, x3, #0x1\n" + "fmla z26.h, p3/M, z4.h, z9.h\n" + "cmp x3, x19\n" + "fmla z25.h, p3/M, z3.h, z9.h\n" + "fmla z23.h, p3/M, z2.h, z9.h\n" + "csel x3, x3, XZR, LT\n" + "fmla z22.h, p3/M, z1.h, z9.h\n" + "csel x2, x2, x21, LT\n" + "fmla z21.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x13, x10, LSL #1]\n" // Load input point (3, 2) + "cmp x2, x20\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x11]\n" // Load input point (5, 0) + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n" // Load input point (5, 5) + "fmla z30.h, p3/M, z8.h, z12.h\n" + "fmla z29.h, p3/M, z7.h, z12.h\n" + "fmla z26.h, p3/M, z5.h, z12.h\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "fmla z22.h, p3/M, z2.h, z12.h\n" + "fmla z21.h, p3/M, z1.h, z12.h\n" + "fmla z20.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x8, x7, LSL #1]\n" // Load input point (0, 1) + "fmla z19.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x13, x9, LSL #1]\n" // Load input point (3, 3) + "fmla z16.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x8, x28, LSL #1]\n" // Load input point (0, 4) + "fmla z27.h, p3/M, z8.h, z9.h\n" + "fmla z26.h, p3/M, z7.h, z9.h\n" + "fmla z25.h, p3/M, z6.h, z9.h\n" + "fmla z23.h, p3/M, z5.h, z9.h\n" + "fmla z22.h, p3/M, z4.h, z9.h\n" + "fmla z21.h, p3/M, z3.h, z9.h\n" + "fmla z19.h, p3/M, z2.h, z9.h\n" + "fmla z18.h, p3/M, z1.h, z9.h\n" + "fmla z17.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x15]\n" // Load input point (1, 0) + "fmla z31.h, p3/M, z1.h, z12.h\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x15, x27, LSL #1]\n" // Load input point (1, 5) + "fmla z29.h, p3/M, z2.h, z11.h\n" + "fmla z28.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12]\n" // Load input point (4, 0) + "fmla z26.h, p3/M, z8.h, z10.h\n" + "fmla z25.h, p3/M, z7.h, z10.h\n" + "fmla z24.h, p3/M, z6.h, z10.h\n" + "fmla z22.h, p3/M, z5.h, z10.h\n" + "fmla z21.h, p3/M, z4.h, z10.h\n" + "fmla z20.h, p3/M, z3.h, z10.h\n" + "fmla z18.h, p3/M, z2.h, z10.h\n" + "fmla z17.h, p3/M, z1.h, z10.h\n" + "fmla z16.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x15, x10, LSL #1]\n" // Load input point (1, 2) + "fmla z31.h, p3/M, z3.h, z9.h\n" + "fmla z27.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x15, x9, LSL #1]\n" // Load input point (1, 3) + "fmla z23.h, p3/M, z6.h, z11.h\n" + "fmla z19.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n" // Load input point (4, 5) + "fmla z31.h, p3/M, z5.h, z10.h\n" + "fmla z30.h, p3/M, z4.h, z10.h\n" + "fmla z29.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z2.h, z10.h\n" + "fmla z26.h, p3/M, z1.h, z10.h\n" + "fmla z25.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n" // Load input point (2, 1) + "fmla z20.h, p3/M, z8.h, z11.h\n" + "fmla z16.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h 
}, p2/Z, [x11, x7, LSL #1]\n" // Load input point (5, 1) + "fmla z30.h, p3/M, z5.h, z12.h\n" + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "fmla z26.h, p3/M, z2.h, z12.h\n" + "fmla z25.h, p3/M, z1.h, z12.h\n" + "fmla z24.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x28, LSL #1]\n" // Load input point (2, 4) + "fmla z19.h, p3/M, z7.h, z11.h\n" + "fmla z18.h, p3/M, z6.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x28, LSL #1]\n" // Load input point (5, 4) + "fmla z31.h, p3/M, z7.h, z10.h\n" + "fmla z30.h, p3/M, z6.h, z10.h\n" + "fmla z27.h, p3/M, z4.h, z10.h\n" + "fmla z26.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z22.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x8, x10, LSL #1]\n" // Load input point (0, 2) + "fmla z17.h, p3/M, z8.h, z11.h\n" + "fmla z16.h, p3/M, z7.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x7, LSL #1]\n" // Load input point (3, 1) + "fmla z29.h, p3/M, z8.h, z12.h\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z25.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z4.h, z12.h\n" + "fmla z21.h, p3/M, z2.h, z12.h\n" + "fmla z20.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x8, x9, LSL #1]\n" // Load input point (0, 3) + "fmla z31.h, p3/M, z2.h, z10.h\n" + "fmla z30.h, p3/M, z1.h, z10.h\n" + "fmla z29.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14]\n" // Load input point (2, 0) + "fmla z27.h, p3/M, z7.h, z11.h\n" + "fmla z26.h, p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z4.h, z11.h\n" + "fmla z22.h, p3/M, z3.h, z11.h\n" + "fmla z19.h, p3/M, z1.h, z11.h\n" + "fmla z18.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x28, LSL #1]\n" // Load input point (3, 4) + "fmla z30.h, p3/M, z2.h, z12.h\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "fmla z28.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x14, x27, LSL #1]\n" // Load input point (2, 5) + "fmla z31.h, p3/M, z6.h, z10.h\n" + "fmla z27.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x13]\n" // Load input point (3, 0) + "fmla z25.h, p3/M, z8.h, z11.h\n" + "fmla z24.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, p3/M, z5.h, z11.h\n" + "fmla z20.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z2.h, z11.h\n" + "fmla z16.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12, x10, LSL #1]\n" // Load input point (4, 2) + "fmla z28.h, p3/M, z8.h, z12.h\n" + "fmla z24.h, p3/M, z5.h, z12.h\n" + "fmla z20.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n" // Load input point (3, 5) + "fmla z27.h, p3/M, z6.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z19.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x11, x10, LSL #1]\n" // Load input point (5, 2) + "fmla z22.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z8.h, z11.h\n" + "fmla z19.h, p3/M, z5.h, z11.h\n" + "fmla z18.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x12, x9, LSL #1]\n" // Load input point (4, 3) + "fmla z24.h, p3/M, z8.h, z12.h\n" + "fmla z20.h, p3/M, z5.h, z12.h\n" + "fmla z16.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x9, LSL #1]\n" // Load input point (5, 3) + "fmla z19.h, p3/M, z8.h, z10.h\n" + "fmla z18.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x15, x7, LSL #1]\n" // Load input point (1, 1) + "fmla z22.h, p3/M, z8.h, z11.h\n" + "fmla z21.h, p3/M, z7.h, z11.h\n" + "fmla z20.h, p3/M, z6.h, z11.h\n" + "fmla z18.h, p3/M, z5.h, z11.h\n" + "fmla z17.h, p3/M, z4.h, z11.h\n" + "fmla z16.h, p3/M, z3.h, z11.h\n" + "ld1h 
{ z11.h }, p2/Z, [x15, x28, LSL #1]\n" // Load input point (1, 4) + "fmla z31.h, p3/M, z4.h, z10.h\n" + "fmla z18.h, p3/M, z8.h, z12.h\n" + "fmla z17.h, p3/M, z7.h, z12.h\n" + "fmla z16.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x12, x7, LSL #1]\n" // Load input point (4, 1) + "fmla z30.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z1.h, z10.h\n" + "fmla z26.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x12, x28, LSL #1]\n" // Load input point (4, 4) + "fmla z29.h, p3/M, z5.h, z11.h\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "fmla z25.h, p3/M, z2.h, z11.h\n" + "fmla z24.h, p3/M, z1.h, z11.h\n" + "fmla z23.h, p3/M, z7.h, z12.h\n" + "fmla z22.h, p3/M, z6.h, z12.h\n" + "fmla z19.h, p3/M, z4.h, z12.h\n" + "fmla z18.h, p3/M, z3.h, z12.h\n" + "fmla z21.h, p3/M, z8.h, z10.h\n" + "fmla z20.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z5.h, z10.h\n" + "fmla z16.h, p3/M, z4.h, z10.h\n" + "fmax z31.h, p3/M, z31.h, z15.h\n" + "fmax z30.h, p3/M, z30.h, z15.h\n" + "fmax z29.h, p3/M, z29.h, z15.h\n" + "fmax z28.h, p3/M, z28.h, z15.h\n" + "fmin z31.h, p3/M, z31.h, z14.h\n" + "st1h { z31.h }, p0, [x16]\n" // Store output point (0, 0) + "fmin z30.h, p3/M, z30.h, z14.h\n" + "fmin z29.h, p3/M, z29.h, z14.h\n" + "st1h { z30.h }, p0, [x16, x17, LSL #1]\n" // Store output point (0, 1) + "fmin z28.h, p3/M, z28.h, z14.h\n" + "fmax z27.h, p3/M, z27.h, z15.h\n" + "st1h { z29.h }, p0, [x16, x23, LSL #1]\n" // Store output point (0, 2) + "fmax z26.h, p3/M, z26.h, z15.h\n" + "st1h { z28.h }, p0, [x16, x22, LSL #1]\n" // Store output point (0, 3) + "fmin z27.h, p3/M, z27.h, z14.h\n" + "fmax z25.h, p3/M, z25.h, z15.h\n" + "st1h { z27.h }, p0, [x26]\n" // Store output point (1, 0) + "fmin z26.h, p3/M, z26.h, z14.h\n" + "fmin z25.h, p3/M, z25.h, z14.h\n" + "st1h { z26.h }, p0, [x26, x17, LSL #1]\n" // Store output point (1, 1) + "fmax z24.h, p3/M, z24.h, z15.h\n" + "fmax z23.h, p3/M, z23.h, z15.h\n" + "st1h { z25.h }, p0, [x26, x23, LSL #1]\n" // Store output point (1, 2) + "fmax z22.h, p3/M, z22.h, z15.h\n" + "fmax z21.h, p3/M, z21.h, z15.h\n" + "fmax z20.h, p3/M, z20.h, z15.h\n" + "fmin z24.h, p3/M, z24.h, z14.h\n" + "st1h { z24.h }, p0, [x26, x22, LSL #1]\n" // Store output point (1, 3) + "fmin z23.h, p3/M, z23.h, z14.h\n" + "fmin z22.h, p3/M, z22.h, z14.h\n" + "st1h { z23.h }, p0, [x25]\n" // Store output point (2, 0) + "fmin z21.h, p3/M, z21.h, z14.h\n" + "fmin z20.h, p3/M, z20.h, z14.h\n" + "st1h { z22.h }, p0, [x25, x17, LSL #1]\n" // Store output point (2, 1) + "fmax z19.h, p3/M, z19.h, z15.h\n" + "st1h { z21.h }, p0, [x25, x23, LSL #1]\n" // Store output point (2, 2) + "fmax z18.h, p3/M, z18.h, z15.h\n" + "fmax z17.h, p3/M, z17.h, z15.h\n" + "st1h { z20.h }, p0, [x25, x22, LSL #1]\n" // Store output point (2, 3) + "fmin z19.h, p3/M, z19.h, z14.h\n" + "st1h { z19.h }, p0, [x24]\n" // Store output point (3, 0) + "fmin z18.h, p3/M, z18.h, z14.h\n" + "fmin z17.h, p3/M, z17.h, z14.h\n" + "st1h { z18.h }, p0, [x24, x17, LSL #1]\n" // Store output point (3, 1) + "fmax z16.h, p3/M, z16.h, z15.h\n" + "st1h { z17.h }, p0, [x24, x23, LSL #1]\n" // Store output point (3, 2) + "fmin z16.h, p3/M, z16.h, z14.h\n" + "st1h { z16.h }, p0, [x24, x22, LSL #1]\n" // Store output point (3, 3) + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), 
[offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..8148353f1a --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,746 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + __fp16 *const *outptrs; + const void *params; + const __fp16 min, max; + const __fp16 *inptrs[36]; + + Args( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *const params, + const __fp16 min, + const __fp16 max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[14]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[5]; + inptrs[3] = input_ptrs[15]; + inptrs[4] = input_ptrs[30]; + inptrs[5] = input_ptrs[35]; + inptrs[6] = input_ptrs[20]; + inptrs[7] = input_ptrs[1]; + inptrs[8] = input_ptrs[4]; + inptrs[9] = input_ptrs[21]; + inptrs[10] = input_ptrs[6]; + inptrs[11] = input_ptrs[11]; + inptrs[12] = input_ptrs[24]; + inptrs[13] = input_ptrs[8]; + inptrs[14] = input_ptrs[29]; + inptrs[15] = input_ptrs[9]; + inptrs[16] = input_ptrs[31]; + inptrs[17] = input_ptrs[13]; + inptrs[18] = input_ptrs[34]; + inptrs[19] = input_ptrs[16]; + inptrs[20] = input_ptrs[2]; + inptrs[21] = input_ptrs[19]; + inptrs[22] = input_ptrs[3]; + inptrs[23] = input_ptrs[12]; + inptrs[24] = input_ptrs[22]; + inptrs[25] = input_ptrs[17]; + inptrs[26] = input_ptrs[18]; + inptrs[27] = input_ptrs[26]; + inptrs[28] = input_ptrs[23]; + inptrs[29] = input_ptrs[32]; + inptrs[30] = input_ptrs[27]; + inptrs[31] = input_ptrs[33]; + inptrs[32] = input_ptrs[7]; + inptrs[33] = input_ptrs[10]; + inptrs[34] = input_ptrs[25]; + inptrs[35] = input_ptrs[28]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x2, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n" + "add x4, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "mov x5, #0x0\n" + "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "cnth x6\n" + "ld1h { z13.h }, p3/Z, [x3]\n" // Load from weights and bias + "mov z31.d, z13.d\n" + "ld1h { z0.h }, p3/Z, [x3, #1, MUL VL]\n" // Load from weights and bias + "sub x7, XZR, x6\n" + "mov z30.d, z13.d\n" + "ld1h { z1.h }, p3/Z, [x3, #2, MUL VL]\n" // Load from weights and bias + "whilelt p2.h, XZR, %x[n_channels]\n" + "mov z29.d, z13.d\n" + "ld1h { z2.h }, p3/Z, [x3, #3, MUL VL]\n" // Load from weights and bias + "cmp x6, %x[n_channels]\n" + "mov z28.d, z13.d\n" + "ld1h { z3.h }, p3/Z, [x3, #4, MUL VL]\n" // Load from weights and bias + "mov z27.d, z13.d\n" + "ld1h { z4.h }, p3/Z, [x3, #5, MUL VL]\n" // Load from weights and bias + "mov z26.d, z13.d\n" + "ld1h { z5.h }, p3/Z, [x3, #6, MUL VL]\n" // Load from weights and bias + "mov z25.d, z13.d\n" + "ld1h { z6.h }, p3/Z, [x3, #7, MUL VL]\n" // Load from weights and bias + "addvl x3, x3, #16\n" + "mov z24.d, z13.d\n" + "ld1h { z7.h }, p3/Z, [x3, #-8, MUL VL]\n" // Load from weights and bias + "mov z23.d, z13.d\n" + "ld1h { z8.h }, p3/Z, [x3, #-7, MUL VL]\n" // Load from weights and bias + "addvl x3, x3, #-6\n" + "mov z22.d, z13.d\n" + "ldp x8, x17, [x4, #0x0]\n" + "mov z21.d, z13.d\n" + "ldp x16, x15, [x4, #0x10]\n" + "mov z20.d, z13.d\n" + "ld1h { z9.h }, p2/Z, [x8, x5,
LSL #1]\n" + "mov z19.d, z13.d\n" + "mov z18.d, z13.d\n" + "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n" + "mov z17.d, z13.d\n" + "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n" + "mov z16.d, z13.d\n" + "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x14, [x4, #0x20]\n" + "whilelt p1.h, x6, %x[n_channels]\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "ldr x13, [x4, #0x28]\n" + "inch x7\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ldr x12, [x4, #0x30]\n" + "mov p0.b, p2.b\n" + "fmla z27.h, p3/M, z5.h, z9.h\n" + "ldr x11, [x4, #0x38]\n" + "fmla z26.h, p3/M, z4.h, z9.h\n" + "ldr x10, [x4, #0x40]\n" + "fmla z25.h, p3/M, z3.h, z9.h\n" + "ldr x9, [x4, #0x48]\n" + "fmla z23.h, p3/M, z2.h, z9.h\n" + "ldr x28, [x4, #0x50]\n" + "fmla z22.h, p3/M, z1.h, z9.h\n" + "ldr x27, [x4, #0x58]\n" + "fmla z21.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x12, x5, LSL #1]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n" + "fmla z30.h, p3/M, z8.h, z12.h\n" + "ldr x26, [x4, #0x60]\n" + "fmla z29.h, p3/M, z7.h, z12.h\n" + "ldr x25, [x4, #0x68]\n" + "fmla z26.h, p3/M, z5.h, z12.h\n" + "ldr x24, [x4, #0x70]\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "ldr x23, [x4, #0x78]\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "ldr x8, [x4, #0x80]\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "ldr x17, [x4, #0x88]\n" + "fmla z22.h, p3/M, z2.h, z12.h\n" + "ldr x16, [x4, #0x90]\n" + "fmla z21.h, p3/M, z1.h, z12.h\n" + "ldr x15, [x4, #0x98]\n" + "fmla z20.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n" + "fmla z19.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x9, x5, LSL #1]\n" + "fmla z16.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n" + "fmla z27.h, p3/M, z8.h, z9.h\n" + "ldr x14, [x4, #0xa0]\n" + "fmla z26.h, p3/M, z7.h, z9.h\n" + "ldr x13, [x4, #0xa8]\n" + "fmla z25.h, p3/M, z6.h, z9.h\n" + "ldr x12, [x4, #0xb0]\n" + "fmla z23.h, p3/M, z5.h, z9.h\n" + "ldr x11, [x4, #0xb8]\n" + "fmla z22.h, p3/M, z4.h, z9.h\n" + "ldr x10, [x4, #0xc0]\n" + "fmla z21.h, p3/M, z3.h, z9.h\n" + "ldr x9, [x4, #0xc8]\n" + "fmla z19.h, p3/M, z2.h, z9.h\n" + "ldr x22, [x2, #0x0]\n" + "fmla z18.h, p3/M, z1.h, z9.h\n" + "ldr x21, [x2, #0x8]\n" + "fmla z17.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x28, x5, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z12.h\n" + "ldr x28, [x4, #0xd0]\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x5, LSL #1]\n" + "fmla z29.h, p3/M, z2.h, z11.h\n" + "ldr x27, [x4, #0xd8]\n" + "fmla z28.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x26, x5, LSL #1]\n" + "fmla z26.h, p3/M, z8.h, z10.h\n" + "ldr x26, [x4, #0xe0]\n" + "fmla z25.h, p3/M, z7.h, z10.h\n" + "ldr x20, [x2, #0x10]\n" + "fmla z24.h, p3/M, z6.h, z10.h\n" + "ldr x19, [x2, #0x18]\n" + "fmla z22.h, p3/M, z5.h, z10.h\n" + "ld1h { z13.h }, p3/Z, [x3]\n" // Load from weights and bias + "fmla z21.h, p3/M, z4.h, z10.h\n" + "fmla z20.h, p3/M, z3.h, z10.h\n" + "fmla z18.h, p3/M, z2.h, z10.h\n" + "fmla z17.h, p3/M, z1.h, z10.h\n" + "fmla z16.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n" + "fmla z31.h, p3/M, z3.h, z9.h\n" + "ldr x25, [x4, #0xe8]\n" + "fmla z27.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n" + "fmla z23.h, p3/M, z6.h, z11.h\n" + "ldr x23, [x4, #0xf8]\n" + "fmla z19.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, 
[x24, x5, LSL #1]\n" + "fmla z31.h, p3/M, z5.h, z10.h\n" + "ldr x24, [x4, #0xf0]\n" + "fmla z30.h, p3/M, z4.h, z10.h\n" + "fmla z29.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z2.h, z10.h\n" + "fmla z26.h, p3/M, z1.h, z10.h\n" + "fmla z25.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n" + "fmla z20.h, p3/M, z8.h, z11.h\n" + "ldr x17, [x4, #0x108]\n" + "fmla z16.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x8, x5, LSL #1]\n" + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ldr x8, [x4, #0x100]\n" + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "fmla z26.h, p3/M, z2.h, z12.h\n" + "fmla z25.h, p3/M, z1.h, z12.h\n" + "fmla z24.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n" + "fmla z19.h, p3/M, z7.h, z11.h\n" + "ldr x15, [x4, #0x118]\n" + "fmla z18.h, p3/M, z6.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n" + "fmla z31.h, p3/M, z7.h, z10.h\n" + "ldr x16, [x4, #0x110]\n" + "fmla z30.h, p3/M, z6.h, z10.h\n" + "fmla z27.h, p3/M, z4.h, z10.h\n" + "fmla z26.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z22.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n" + "fmla z17.h, p3/M, z8.h, z11.h\n" + "fmla z16.h, p3/M, z7.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n" + "fmla z29.h, p3/M, z8.h, z12.h\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z25.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z4.h, z12.h\n" + "fmla z21.h, p3/M, z2.h, z12.h\n" + "fmla z20.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n" + "fmla z31.h, p3/M, z2.h, z10.h\n" + "fmla z30.h, p3/M, z1.h, z10.h\n" + "fmla z29.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x11, x5, LSL #1]\n" + "fmla z27.h, p3/M, z7.h, z11.h\n" + "fmla z26.h, p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z4.h, z11.h\n" + "fmla z22.h, p3/M, z3.h, z11.h\n" + "fmla z19.h, p3/M, z1.h, z11.h\n" + "fmla z18.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z12.h\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "fmla z28.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x5, LSL #1]\n" + "fmla z31.h, p3/M, z6.h, z10.h\n" + "fmla z27.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n" + "fmla z25.h, p3/M, z8.h, z11.h\n" + "fmla z24.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, p3/M, z5.h, z11.h\n" + "fmla z20.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z2.h, z11.h\n" + "fmla z16.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x5, LSL #1]\n" + "fmla z28.h, p3/M, z8.h, z12.h\n" + "fmla z24.h, p3/M, z5.h, z12.h\n" + "fmla z20.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x26, x5, LSL #1]\n" + "fmla z27.h, p3/M, z6.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z19.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n" + "fmla z22.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z8.h, z11.h\n" + "fmla z19.h, p3/M, z5.h, z11.h\n" + "fmla z18.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n" + "fmla z24.h, p3/M, z8.h, z12.h\n" + "fmla z20.h, p3/M, z5.h, z12.h\n" + "fmla z16.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n" + "fmla z19.h, p3/M, z8.h, z10.h\n" + "fmla z18.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x8, x5, LSL #1]\n" + "fmla z22.h, p3/M, z8.h, z11.h\n" + "fmla z21.h, p3/M, z7.h, z11.h\n" + "fmla z20.h, p3/M, z6.h, z11.h\n" + "fmla z18.h, p3/M, 
z5.h, z11.h\n" + "fmla z17.h, p3/M, z4.h, z11.h\n" + "fmla z16.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x17, x5, LSL #1]\n" + "fmla z31.h, p3/M, z4.h, z10.h\n" + "ldp x8, x17, [x4, #0x0]\n" + "fmla z18.h, p3/M, z8.h, z12.h\n" + "ld1h { z9.h }, p1/Z, [x8, x6, LSL #1]\n" + "fmla z17.h, p3/M, z7.h, z12.h\n" + "fmla z16.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x16, x5, LSL #1]\n" + "fmla z30.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z1.h, z10.h\n" + "fmla z26.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x15, x5, LSL #1]\n" + "inch x5\n" + "fmla z29.h, p3/M, z5.h, z11.h\n" + "ldp x16, x15, [x4, #0x10]\n" + "whilelt p2.h, x5, %x[n_channels]\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x3, #1, MUL VL]\n" // Load from weights and bias + "fmla z25.h, p3/M, z2.h, z11.h\n" + "ld1h { z2.h }, p3/Z, [x3, #3, MUL VL]\n" // Load from weights and bias + "fmla z24.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x16, x6, LSL #1]\n" + "fmla z23.h, p3/M, z7.h, z12.h\n" + "ld1h { z1.h }, p3/Z, [x3, #2, MUL VL]\n" // Load from weights and bias + "fmla z22.h, p3/M, z6.h, z12.h\n" + "ld1h { z6.h }, p3/Z, [x3, #7, MUL VL]\n" // Load from weights and bias + "fmla z19.h, p3/M, z4.h, z12.h\n" + "fmla z18.h, p3/M, z3.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x15, x6, LSL #1]\n" + "fmla z21.h, p3/M, z8.h, z10.h\n" + "ld1h { z3.h }, p3/Z, [x3, #4, MUL VL]\n" // Load from weights and bias + "fmla z20.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z5.h, z10.h\n" + "ld1h { z5.h }, p3/Z, [x3, #6, MUL VL]\n" // Load from weights and bias + "fmla z16.h, p3/M, z4.h, z10.h\n" + "ld1h { z10.h }, p1/Z, [x17, x6, LSL #1]\n" + "inch x6\n" + "fmax z31.h, p3/M, z31.h, z15.h\n" + "ld1h { z4.h }, p3/Z, [x3, #5, MUL VL]\n" // Load from weights and bias + "addvl x3, x3, #16\n" + "fmax z30.h, p3/M, z30.h, z15.h\n" + "ld1h { z7.h }, p3/Z, [x3, #-8, MUL VL]\n" // Load from weights and bias + "cmp x6, %x[n_channels]\n" + "fmax z29.h, p3/M, z29.h, z15.h\n" + "ld1h { z8.h }, p3/Z, [x3, #-7, MUL VL]\n" // Load from weights and bias + "addvl x3, x3, #-6\n" + "fmax z28.h, p3/M, z28.h, z15.h\n" + "fmax z27.h, p3/M, z27.h, z15.h\n" + "fmin z31.h, p3/M, z31.h, z14.h\n" + "st1h { z31.h }, p0, [x22, x7, LSL #1]\n" + "mov z31.d, z13.d\n" + "fmin z30.h, p3/M, z30.h, z14.h\n" + "ldr x22, [x2, #0x20]\n" + "fmin z29.h, p3/M, z29.h, z14.h\n" + "st1h { z30.h }, p0, [x21, x7, LSL #1]\n" + "mov z30.d, z13.d\n" + "fmin z28.h, p3/M, z28.h, z14.h\n" + "st1h { z29.h }, p0, [x20, x7, LSL #1]\n" + "mov z29.d, z13.d\n" + "ldr x21, [x2, #0x28]\n" + "fmin z27.h, p3/M, z27.h, z14.h\n" + "ldr x20, [x2, #0x30]\n" + "fmax z26.h, p3/M, z26.h, z15.h\n" + "st1h { z28.h }, p0, [x19, x7, LSL #1]\n" + "mov z28.d, z13.d\n" + "ldr x19, [x2, #0x38]\n" + "fmax z25.h, p3/M, z25.h, z15.h\n" + "st1h { z27.h }, p0, [x22, x7, LSL #1]\n" + "mov z27.d, z13.d\n" + "ldr x22, [x2, #0x40]\n" + "fmin z26.h, p3/M, z26.h, z14.h\n" + "st1h { z26.h }, p0, [x21, x7, LSL #1]\n" + "mov z26.d, z13.d\n" + "fmin z25.h, p3/M, z25.h, z14.h\n" + "ldr x21, [x2, #0x48]\n" + "fmax z24.h, p3/M, z24.h, z15.h\n" + "st1h { z25.h }, p0, [x20, x7, LSL #1]\n" + "mov z25.d, z13.d\n" + "fmax z23.h, p3/M, z23.h, z15.h\n" + "ldr x20, [x2, #0x50]\n" + "fmin z24.h, p3/M, z24.h, z14.h\n" + "st1h { z24.h }, p0, [x19, x7, LSL #1]\n" + "mov z24.d, z13.d\n" + "fmin z23.h, p3/M, z23.h, z14.h\n" + "ldr x19, [x2, #0x58]\n" + "fmax z22.h, p3/M, z22.h, z15.h\n" + "st1h { z23.h }, p0, [x22, x7, LSL #1]\n" + "mov z23.d, z13.d\n" + "fmax z21.h, p3/M, z21.h, z15.h\n" + "ldr x22, [x2, #0x60]\n" + 
"fmin z22.h, p3/M, z22.h, z14.h\n" + "st1h { z22.h }, p0, [x21, x7, LSL #1]\n" + "mov z22.d, z13.d\n" + "fmin z21.h, p3/M, z21.h, z14.h\n" + "ldr x21, [x2, #0x68]\n" + "fmax z20.h, p3/M, z20.h, z15.h\n" + "st1h { z21.h }, p0, [x20, x7, LSL #1]\n" + "mov z21.d, z13.d\n" + "fmax z19.h, p3/M, z19.h, z15.h\n" + "ldr x20, [x2, #0x70]\n" + "fmin z20.h, p3/M, z20.h, z14.h\n" + "st1h { z20.h }, p0, [x19, x7, LSL #1]\n" + "mov z20.d, z13.d\n" + "fmin z19.h, p3/M, z19.h, z14.h\n" + "ldr x19, [x2, #0x78]\n" + "fmax z18.h, p3/M, z18.h, z15.h\n" + "st1h { z19.h }, p0, [x22, x7, LSL #1]\n" + "mov z19.d, z13.d\n" + "fmax z17.h, p3/M, z17.h, z15.h\n" + "fmin z18.h, p3/M, z18.h, z14.h\n" + "st1h { z18.h }, p0, [x21, x7, LSL #1]\n" + "mov z18.d, z13.d\n" + "fmin z17.h, p3/M, z17.h, z14.h\n" + "st1h { z17.h }, p0, [x20, x7, LSL #1]\n" + "mov z17.d, z13.d\n" + "fmax z16.h, p3/M, z16.h, z15.h\n" + "fmin z16.h, p3/M, z16.h, z14.h\n" + "st1h { z16.h }, p0, [x19, x7, LSL #1]\n" + "mov z16.d, z13.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x14, [x4, #0x20]\n" + "inch x7\n" + "fmla z30.h, p3/M, z7.h, z9.h\n" + "ldr x13, [x4, #0x28]\n" + "mov p0.b, p2.b\n" + "fmla z29.h, p3/M, z6.h, z9.h\n" + "ldr x12, [x4, #0x30]\n" + "fmla z27.h, p3/M, z5.h, z9.h\n" + "ldr x11, [x4, #0x38]\n" + "fmla z26.h, p3/M, z4.h, z9.h\n" + "ldr x10, [x4, #0x40]\n" + "fmla z25.h, p3/M, z3.h, z9.h\n" + "ldr x9, [x4, #0x48]\n" + "fmla z23.h, p3/M, z2.h, z9.h\n" + "ldr x28, [x4, #0x50]\n" + "fmla z22.h, p3/M, z1.h, z9.h\n" + "ldr x27, [x4, #0x58]\n" + "fmla z21.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x12, x5, LSL #1]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n" + "fmla z30.h, p3/M, z8.h, z12.h\n" + "ldr x26, [x4, #0x60]\n" + "fmla z29.h, p3/M, z7.h, z12.h\n" + "ldr x25, [x4, #0x68]\n" + "fmla z26.h, p3/M, z5.h, z12.h\n" + "ldr x24, [x4, #0x70]\n" + "fmla z28.h, p3/M, z6.h, z12.h\n" + "ldr x23, [x4, #0x78]\n" + "fmla z25.h, p3/M, z4.h, z12.h\n" + "ldr x8, [x4, #0x80]\n" + "fmla z24.h, p3/M, z3.h, z12.h\n" + "ldr x17, [x4, #0x88]\n" + "fmla z22.h, p3/M, z2.h, z12.h\n" + "ldr x16, [x4, #0x90]\n" + "fmla z21.h, p3/M, z1.h, z12.h\n" + "ldr x15, [x4, #0x98]\n" + "fmla z20.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n" + "fmla z19.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x9, x5, LSL #1]\n" + "fmla z16.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n" + "fmla z27.h, p3/M, z8.h, z9.h\n" + "ldr x14, [x4, #0xa0]\n" + "fmla z26.h, p3/M, z7.h, z9.h\n" + "ldr x13, [x4, #0xa8]\n" + "fmla z25.h, p3/M, z6.h, z9.h\n" + "ldr x12, [x4, #0xb0]\n" + "fmla z23.h, p3/M, z5.h, z9.h\n" + "ldr x11, [x4, #0xb8]\n" + "fmla z22.h, p3/M, z4.h, z9.h\n" + "ldr x10, [x4, #0xc0]\n" + "fmla z21.h, p3/M, z3.h, z9.h\n" + "ldr x9, [x4, #0xc8]\n" + "fmla z19.h, p3/M, z2.h, z9.h\n" + "ldr x22, [x2, #0x0]\n" + "fmla z18.h, p3/M, z1.h, z9.h\n" + "ldr x21, [x2, #0x8]\n" + "fmla z17.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x28, x5, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z12.h\n" + "ldr x28, [x4, #0xd0]\n" + "fmla z30.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x5, LSL #1]\n" + "fmla z29.h, p3/M, z2.h, z11.h\n" + "ldr x27, [x4, #0xd8]\n" + "fmla z28.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x26, x5, LSL #1]\n" + "fmla z26.h, p3/M, z8.h, z10.h\n" + "ldr x26, [x4, #0xe0]\n" + "fmla z25.h, p3/M, z7.h, z10.h\n" + "ldr x20, [x2, #0x10]\n" + "fmla z24.h, 
p3/M, z6.h, z10.h\n" + "ldr x19, [x2, #0x18]\n" + "fmla z22.h, p3/M, z5.h, z10.h\n" + "fmla z21.h, p3/M, z4.h, z10.h\n" + "fmla z20.h, p3/M, z3.h, z10.h\n" + "fmla z18.h, p3/M, z2.h, z10.h\n" + "fmla z17.h, p3/M, z1.h, z10.h\n" + "fmla z16.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n" + "fmla z31.h, p3/M, z3.h, z9.h\n" + "ldr x25, [x4, #0xe8]\n" + "fmla z27.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n" + "fmla z23.h, p3/M, z6.h, z11.h\n" + "ldr x23, [x4, #0xf8]\n" + "fmla z19.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n" + "fmla z31.h, p3/M, z5.h, z10.h\n" + "ldr x24, [x4, #0xf0]\n" + "fmla z30.h, p3/M, z4.h, z10.h\n" + "fmla z29.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z2.h, z10.h\n" + "fmla z26.h, p3/M, z1.h, z10.h\n" + "fmla z25.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x17, x5, LSL #1]\n" + "fmla z20.h, p3/M, z8.h, z11.h\n" + "ldr x17, [x4, #0x108]\n" + "fmla z16.h, p3/M, z5.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x8, x5, LSL #1]\n" + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ldr x8, [x4, #0x100]\n" + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "fmla z26.h, p3/M, z2.h, z12.h\n" + "fmla z25.h, p3/M, z1.h, z12.h\n" + "fmla z24.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x15, x5, LSL #1]\n" + "fmla z19.h, p3/M, z7.h, z11.h\n" + "ldr x15, [x4, #0x118]\n" + "fmla z18.h, p3/M, z6.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x16, x5, LSL #1]\n" + "fmla z31.h, p3/M, z7.h, z10.h\n" + "ldr x16, [x4, #0x110]\n" + "fmla z30.h, p3/M, z6.h, z10.h\n" + "fmla z27.h, p3/M, z4.h, z10.h\n" + "fmla z26.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z1.h, z10.h\n" + "fmla z22.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x14, x5, LSL #1]\n" + "fmla z17.h, p3/M, z8.h, z11.h\n" + "fmla z16.h, p3/M, z7.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x13, x5, LSL #1]\n" + "fmla z29.h, p3/M, z8.h, z12.h\n" + "fmla z28.h, p3/M, z7.h, z12.h\n" + "fmla z25.h, p3/M, z5.h, z12.h\n" + "fmla z24.h, p3/M, z4.h, z12.h\n" + "fmla z21.h, p3/M, z2.h, z12.h\n" + "fmla z20.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x12, x5, LSL #1]\n" + "fmla z31.h, p3/M, z2.h, z10.h\n" + "fmla z30.h, p3/M, z1.h, z10.h\n" + "fmla z29.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x11, x5, LSL #1]\n" + "fmla z27.h, p3/M, z7.h, z11.h\n" + "fmla z26.h, p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z4.h, z11.h\n" + "fmla z22.h, p3/M, z3.h, z11.h\n" + "fmla z19.h, p3/M, z1.h, z11.h\n" + "fmla z18.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x5, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z12.h\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "fmla z28.h, p3/M, z0.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x5, LSL #1]\n" + "fmla z31.h, p3/M, z6.h, z10.h\n" + "fmla z27.h, p3/M, z3.h, z10.h\n" + "fmla z23.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x28, x5, LSL #1]\n" + "fmla z25.h, p3/M, z8.h, z11.h\n" + "fmla z24.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, p3/M, z5.h, z11.h\n" + "fmla z20.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z2.h, z11.h\n" + "fmla z16.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x5, LSL #1]\n" + "fmla z28.h, p3/M, z8.h, z12.h\n" + "fmla z24.h, p3/M, z5.h, z12.h\n" + "fmla z20.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x26, x5, LSL #1]\n" + "fmla z27.h, p3/M, z6.h, z10.h\n" + "fmla z23.h, p3/M, z3.h, z10.h\n" + "fmla z19.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x25, x5, LSL #1]\n" + "fmla z22.h, p3/M, z7.h, z11.h\n" + "fmla z21.h, 
p3/M, z6.h, z11.h\n" + "fmla z23.h, p3/M, z8.h, z11.h\n" + "fmla z19.h, p3/M, z5.h, z11.h\n" + "fmla z18.h, p3/M, z4.h, z11.h\n" + "fmla z17.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, x5, LSL #1]\n" + "fmla z24.h, p3/M, z8.h, z12.h\n" + "fmla z20.h, p3/M, z5.h, z12.h\n" + "fmla z16.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x23, x5, LSL #1]\n" + "fmla z19.h, p3/M, z8.h, z10.h\n" + "fmla z18.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z6.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x8, x5, LSL #1]\n" + "fmla z22.h, p3/M, z8.h, z11.h\n" + "fmla z21.h, p3/M, z7.h, z11.h\n" + "fmla z20.h, p3/M, z6.h, z11.h\n" + "fmla z18.h, p3/M, z5.h, z11.h\n" + "fmla z17.h, p3/M, z4.h, z11.h\n" + "fmla z16.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x17, x5, LSL #1]\n" + "fmla z31.h, p3/M, z4.h, z10.h\n" + "fmla z18.h, p3/M, z8.h, z12.h\n" + "fmla z17.h, p3/M, z7.h, z12.h\n" + "fmla z16.h, p3/M, z6.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x16, x5, LSL #1]\n" + "fmla z30.h, p3/M, z3.h, z10.h\n" + "fmla z27.h, p3/M, z1.h, z10.h\n" + "fmla z26.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x15, x5, LSL #1]\n" + "fmla z29.h, p3/M, z5.h, z11.h\n" + "fmla z28.h, p3/M, z4.h, z11.h\n" + "fmla z25.h, p3/M, z2.h, z11.h\n" + "fmla z24.h, p3/M, z1.h, z11.h\n" + "fmla z23.h, p3/M, z7.h, z12.h\n" + "fmla z22.h, p3/M, z6.h, z12.h\n" + "fmla z19.h, p3/M, z4.h, z12.h\n" + "fmla z18.h, p3/M, z3.h, z12.h\n" + "fmla z21.h, p3/M, z8.h, z10.h\n" + "fmla z20.h, p3/M, z7.h, z10.h\n" + "fmla z17.h, p3/M, z5.h, z10.h\n" + "fmla z16.h, p3/M, z4.h, z10.h\n" + "fmax z31.h, p3/M, z31.h, z15.h\n" + "fmax z30.h, p3/M, z30.h, z15.h\n" + "fmax z29.h, p3/M, z29.h, z15.h\n" + "fmax z28.h, p3/M, z28.h, z15.h\n" + "fmin z31.h, p3/M, z31.h, z14.h\n" + "st1h { z31.h }, p0, [x22, x7, LSL #1]\n" + "fmin z30.h, p3/M, z30.h, z14.h\n" + "fmin z29.h, p3/M, z29.h, z14.h\n" + "ldr x22, [x2, #0x20]\n" + "fmin z28.h, p3/M, z28.h, z14.h\n" + "st1h { z30.h }, p0, [x21, x7, LSL #1]\n" + "fmax z27.h, p3/M, z27.h, z15.h\n" + "fmax z26.h, p3/M, z26.h, z15.h\n" + "st1h { z29.h }, p0, [x20, x7, LSL #1]\n" + "fmax z25.h, p3/M, z25.h, z15.h\n" + "st1h { z28.h }, p0, [x19, x7, LSL #1]\n" + "fmax z24.h, p3/M, z24.h, z15.h\n" + "ldr x21, [x2, #0x28]\n" + "fmax z23.h, p3/M, z23.h, z15.h\n" + "ldr x20, [x2, #0x30]\n" + "fmin z27.h, p3/M, z27.h, z14.h\n" + "ldr x19, [x2, #0x38]\n" + "fmin z26.h, p3/M, z26.h, z14.h\n" + "st1h { z27.h }, p0, [x22, x7, LSL #1]\n" + "fmin z25.h, p3/M, z25.h, z14.h\n" + "fmin z24.h, p3/M, z24.h, z14.h\n" + "st1h { z26.h }, p0, [x21, x7, LSL #1]\n" + "fmin z23.h, p3/M, z23.h, z14.h\n" + "ldr x22, [x2, #0x40]\n" + "fmax z22.h, p3/M, z22.h, z15.h\n" + "ldr x21, [x2, #0x48]\n" + "fmax z21.h, p3/M, z21.h, z15.h\n" + "st1h { z25.h }, p0, [x20, x7, LSL #1]\n" + "fmax z20.h, p3/M, z20.h, z15.h\n" + "st1h { z24.h }, p0, [x19, x7, LSL #1]\n" + "fmax z19.h, p3/M, z19.h, z15.h\n" + "st1h { z23.h }, p0, [x22, x7, LSL #1]\n" + "fmin z22.h, p3/M, z22.h, z14.h\n" + "ldr x20, [x2, #0x50]\n" + "fmin z21.h, p3/M, z21.h, z14.h\n" + "ldr x19, [x2, #0x58]\n" + "fmin z20.h, p3/M, z20.h, z14.h\n" + "ldr x22, [x2, #0x60]\n" + "fmin z19.h, p3/M, z19.h, z14.h\n" + "st1h { z22.h }, p0, [x21, x7, LSL #1]\n" + "fmax z18.h, p3/M, z18.h, z15.h\n" + "st1h { z21.h }, p0, [x20, x7, LSL #1]\n" + "fmax z17.h, p3/M, z17.h, z15.h\n" + "st1h { z20.h }, p0, [x19, x7, LSL #1]\n" + "fmax z16.h, p3/M, z16.h, z15.h\n" + "st1h { z19.h }, p0, [x22, x7, LSL #1]\n" + "ldr x21, [x2, #0x68]\n" + "fmin z18.h, p3/M, z18.h, z14.h\n" + "ldr x20, [x2, #0x70]\n" + "fmin z17.h, 
p3/M, z17.h, z14.h\n" + "ldr x19, [x2, #0x78]\n" + "fmin z16.h, p3/M, z16.h, z14.h\n" + "st1h { z18.h }, p0, [x21, x7, LSL #1]\n" + "st1h { z17.h }, p0, [x20, x7, LSL #1]\n" + "st1h { z16.h }, p0, [x19, x7, LSL #1]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..98f50f8436 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); +void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + +struct sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef __fp16 bias_type; + typedef __fp16 input_type; + typedef __fp16 weight_type; + typedef __fp16 return_type; + + typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + indirect_kern_type indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; + + sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..e620604a16 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const __fp16 *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + __fp16 *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const __fp16 min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const __fp16 *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + __fp16 *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x7, #0x0\n" + "mov x8, #0x0\n" + "1:" // Tile loop + "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x23, #0x4\n" + "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x17, #0x2\n" + "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n" + "mov x15, #0x0\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cnth x14\n" + "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "sub x12, XZR, x14\n" + "ldr x21, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x7, x22\n" // offset = tile_i * ld_input_row + "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x8, x13, x19\n" // offset += tile_j * ld_input_col + "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x23\n" // offset *= kernel_stride * output_size + "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x21, x21, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16) + "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x9, x21, x22, LSL #1\n" + "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x28, x9, x22, LSL #1\n" + "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias + "mov z31.d, z17.d\n" + "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias + "add x27, x28, x22, LSL #1\n" + "mov z30.d, z17.d\n" + "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias + "add x26, x27, x22, LSL #1\n" + "mov z29.d, z17.d\n" + "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias + "add x25, x13, x13\n" + "mov z28.d,
z17.d\n" + "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias + "add x24, x25, x13\n" + "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias + "add x23, x24, x13\n" + "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias + "mul x19, x7, x20\n" // offset = tile_i * ld_output_row + "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias + "madd x19, x8, x11, x19\n" // offset += tile_j * ld_output_col + "whilelt p2.h, XZR, %x[n_channels]\n" + "ld1h { z9.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (2, 2) + "ld1h { z10.h }, p2/Z, [x21]\n" // Load input point (0, 0) + "mul x19, x19, x17\n" // offset *= output_tile_size + "ld1h { z11.h }, p2/Z, [x21, x13, LSL #1]\n" // Load input point (0, 1) + "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "ld1h { z12.h }, p2/Z, [x21, x24, LSL #1]\n" // Load input point (0, 3) + "add x22, x10, x20, LSL #1\n" + "ld1h { z13.h }, p2/Z, [x21, x23, LSL #1]\n" // Load input point (0, 4) + "addvl x16, x16, #16\n" + "ld1h { z14.h }, p2/Z, [x9]\n" // Load input point (1, 0) + "cmp x14, %x[n_channels]\n" + "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias + "addvl x16, x16, #-6\n" + "ld1h { z15.h }, p2/Z, [x9, x13, LSL #1]\n" // Load input point (1, 1) + "ld1h { z16.h }, p2/Z, [x21, x25, LSL #1]\n" // Load input point (0, 2) + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias + "whilelt p1.h, x14, %x[n_channels]\n" + "fmla z30.h, p3/M, z6.h, z9.h\n" + "inch x12\n" + "fmla z29.h, p3/M, z2.h, z9.h\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "inch x15\n" + "addvl x21, x21, #1\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ld1h { z10.h }, p1/Z, [x21]\n" // Load input point (0, 0) + "fmla z30.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (1, 4) + "inch x14\n" + "fmla z31.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (1, 3) + "fmla z30.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (1, 2) + "addvl x9, x9, #1\n" + "fmla z31.h, p3/M, z3.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x27]\n" // Load input point (3, 0) + "fmla z30.h, p3/M, z0.h, z16.h\n" + "fmla z29.h, p3/M, z3.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (3, 4) + "fmla z31.h, p3/M, z4.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x28]\n" // Load input point (2, 0) + "fmla z30.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n" // Load input point (3, 1) + "fmla z29.h, p3/M, z0.h, z15.h\n" + "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n" // Load input point (2, 1) + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (2, 3) + "fmla z29.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (2, 4) + "addvl x28, x28, #1\n" + "fmla z31.h, p3/M, z5.h, z13.h\n" + "ld1h { z9.h }, p1/Z, [x28, x25, LSL #1]\n" // Load input point (2, 2) + "fmla z30.h, p3/M, z3.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (3, 3) + "fmla z29.h, p3/M, z1.h, z16.h\n" + "fmla z31.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x26]\n" // Load 
input point (4, 0) + "fmla z28.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n" // Load input point (4, 1) + "fmla z30.h, p3/M, z7.h, z12.h\n" + "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n" // Load input point (4, 2) + "fmla z31.h, p3/M, z7.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (3, 2) + "addvl x27, x27, #1\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x21, x24, LSL #1]\n" // Load input point (0, 3) + "fmla z30.h, p3/M, z8.h, z11.h\n" + "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z7.h, z13.h\n" + "ld1h { z13.h }, p1/Z, [x21, x23, LSL #1]\n" // Load input point (0, 4) + "fmax z31.h, p3/M, z31.h, z19.h\n" + "fmla z28.h, p3/M, z5.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (4, 3) + "fmax z30.h, p3/M, z30.h, z19.h\n" + "fmla z29.h, p3/M, z5.h, z16.h\n" + "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias + "fmin z31.h, p3/M, z31.h, z18.h\n" + "st1h { z31.h }, p0, [x10]\n" // Store output point (0, 0) + "mov z31.d, z17.d\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (4, 4) + "whilelt p2.h, x15, %x[n_channels]\n" + "fmla z29.h, p3/M, z8.h, z15.h\n" + "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias + "addvl x26, x26, #1\n" + "fmin z30.h, p3/M, z30.h, z18.h\n" + "st1h { z30.h }, p0, [x10, x11, LSL #1]\n" // Store output point (0, 1) + "mov z30.d, z17.d\n" + "addvl x10, x10, #1\n" + "fmla z28.h, p3/M, z3.h, z16.h\n" + "ld1h { z16.h }, p1/Z, [x21, x25, LSL #1]\n" // Load input point (0, 2) + "cmp x14, %x[n_channels]\n" + "fmax z29.h, p3/M, z29.h, z19.h\n" + "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z7.h, z14.h\n" + "ld1h { z14.h }, p1/Z, [x9]\n" // Load input point (1, 0) + "fmin z29.h, p3/M, z29.h, z18.h\n" + "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0) + "mov z29.d, z17.d\n" + "fmla z28.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p1/Z, [x9, x13, LSL #1]\n" // Load input point (1, 1) + "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n" // Load input point (0, 1) + "addvl x16, x16, #16\n" + "fmax z28.h, p3/M, z28.h, z19.h\n" + "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias + "fmin z28.h, p3/M, z28.h, z18.h\n" + "st1h { z28.h }, p0, [x22, x11, LSL #1]\n" // Store output point (1, 1) + "mov z28.d, z17.d\n" + "addvl x22, x22, #1\n" + "addvl x16, x16, #-6\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov p0.b, p2.b\n" + "fmla z30.h, p3/M, z6.h, z9.h\n" + "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x21, x7, #0x1\n" + "fmla z29.h, p3/M, z2.h, z9.h\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "add x8, x8, #0x1\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "cmp x8, x19\n" + "fmla z30.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (1, 4) + "fmla z31.h, p3/M, z1.h, z11.h\n" + "ld1h { 
z11.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (1, 3) + "csel x8, x8, XZR, LT\n" + "fmla z30.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (1, 2) + "csel x7, x7, x21, LT\n" + "fmla z31.h, p3/M, z3.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x27]\n" // Load input point (3, 0) + "cmp x7, x20\n" + "fmla z30.h, p3/M, z0.h, z16.h\n" + "fmla z29.h, p3/M, z3.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (3, 4) + "fmla z31.h, p3/M, z4.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x28]\n" // Load input point (2, 0) + "fmla z30.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x13, LSL #1]\n" // Load input point (3, 1) + "fmla z29.h, p3/M, z0.h, z15.h\n" + "fmla z31.h, p3/M, z2.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x28, x13, LSL #1]\n" // Load input point (2, 1) + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (2, 3) + "fmla z29.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (2, 4) + "fmla z31.h, p3/M, z5.h, z13.h\n" + "fmla z30.h, p3/M, z3.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (3, 3) + "fmla z29.h, p3/M, z1.h, z16.h\n" + "fmla z31.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x26]\n" // Load input point (4, 0) + "fmla z28.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x26, x13, LSL #1]\n" // Load input point (4, 1) + "fmla z30.h, p3/M, z7.h, z12.h\n" + "fmla z29.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x26, x25, LSL #1]\n" // Load input point (4, 2) + "fmla z31.h, p3/M, z7.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (3, 2) + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z30.h, p3/M, z8.h, z11.h\n" + "fmla z29.h, p3/M, z7.h, z13.h\n" + "fmax z31.h, p3/M, z31.h, z19.h\n" + "fmla z28.h, p3/M, z5.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x26, x24, LSL #1]\n" // Load input point (4, 3) + "fmax z30.h, p3/M, z30.h, z19.h\n" + "fmla z29.h, p3/M, z5.h, z16.h\n" + "fmin z31.h, p3/M, z31.h, z18.h\n" + "st1h { z31.h }, p0, [x10]\n" // Store output point (0, 0) + "fmla z28.h, p3/M, z2.h, z11.h\n" + "fmla z29.h, p3/M, z8.h, z15.h\n" + "ld1h { z11.h }, p2/Z, [x26, x23, LSL #1]\n" // Load input point (4, 4) + "fmin z30.h, p3/M, z30.h, z18.h\n" + "st1h { z30.h }, p0, [x10, x11, LSL #1]\n" // Store output point (0, 1) + "fmla z28.h, p3/M, z3.h, z16.h\n" + "fmax z29.h, p3/M, z29.h, z19.h\n" + "fmla z28.h, p3/M, z7.h, z14.h\n" + "fmin z29.h, p3/M, z29.h, z18.h\n" + "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0) + "fmla z28.h, p3/M, z6.h, z15.h\n" + "fmla z28.h, p3/M, z8.h, z11.h\n" + "fmax z28.h, p3/M, z28.h, z19.h\n" + "fmin z28.h, p3/M, z28.h, z18.h\n" + "st1h { z28.h }, p0, [x22, x11, LSL #1]\n" // Store output point (1, 1) + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" 
(offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..3ed743e3ed --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS) + +namespace arm_conv { +namespace depthwise { + +void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *params, + unsigned int n_channels, + const __fp16 activation_min, + const __fp16 activation_max +) +{ + struct Args + { + __fp16 *const *outptrs; + const void *params; + const __fp16 min, max; + const __fp16 *inptrs[25]; + + Args( + const __fp16 *const *const input_ptrs, + __fp16 *const *const outptrs, + const void *const params, + const __fp16 min, + const __fp16 max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[12]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[1]; + inptrs[3] = input_ptrs[3]; + inptrs[4] = input_ptrs[4]; + inptrs[5] = input_ptrs[5]; + inptrs[6] = input_ptrs[6]; + inptrs[7] = input_ptrs[2]; + inptrs[8] = input_ptrs[8]; + inptrs[9] = input_ptrs[9]; + inptrs[10] = input_ptrs[7]; + inptrs[11] = input_ptrs[15]; + inptrs[12] = input_ptrs[10]; + inptrs[13] = input_ptrs[16]; + inptrs[14] = input_ptrs[11]; + inptrs[15] = input_ptrs[18]; + inptrs[16] = input_ptrs[13]; + inptrs[17] = input_ptrs[19]; + inptrs[18] = input_ptrs[20]; + inptrs[19] = input_ptrs[14]; + inptrs[20] = input_ptrs[21]; + inptrs[21] = input_ptrs[17]; + inptrs[22] = input_ptrs[23]; + inptrs[23] = input_ptrs[22]; + inptrs[24] = input_ptrs[24]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n" + "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "mov x14, #0x0\n" + "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "cnth x13\n" + "ldp x12, x11, [x19, #0x0]\n" + "sub x10, XZR, x13\n" + "ldp x9, x28, [x19, #0x10]\n" + "whilelt p2.h, XZR, %x[n_channels]\n" + "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias + "mov z31.d, z17.d\n" + "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias + "cmp x13, %x[n_channels]\n" + "mov z30.d, z17.d\n" + "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias + "mov z29.d, z17.d\n" + "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias + "mov z28.d, z17.d\n" + "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias + "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias + "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias + "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias + "addvl x16, x16, #16\n" + "ldp x27, x26, [x15, #0x0]\n" + "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias + "addvl x16, x16, #-6\n" + "ld1h { z9.h }, p2/Z, [x27, x14, LSL #1]\n" + "ld1h { z10.h }, p2/Z, [x26, x14, LSL #1]\n" + "ldp x25, x23, [x15, #0x10]\n" + "ldp x22, x21, [x15, #0x20]\n" + "ldp x20, x19, [x15, #0x30]\n" + "ld1h { z11.h }, p2/Z, [x25, x14, LSL #1]\n" + "ld1h { z12.h }, p2/Z, [x23, x14, LSL #1]\n" + "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n" + "ld1h { z14.h }, p2/Z, [x21, x14, LSL #1]\n" + "ld1h { z15.h }, p2/Z, [x20, x14, LSL #1]\n" + "ld1h { z16.h }, p2/Z, [x19, x14, LSL #1]\n" + "bge 2f\n" + "1:" //
Channel loop + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x24, [x15, #0x40]\n" + "whilelt p1.h, x13, %x[n_channels]\n" + "fmla z30.h, p3/M, z6.h, z9.h\n" + "ldr x20, [x15, #0x48]\n" + "inch x10\n" + "fmla z29.h, p3/M, z2.h, z9.h\n" + "ldr x23, [x15, #0x50]\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "ldr x19, [x15, #0x58]\n" + "ldr x22, [x15, #0x60]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ldr x21, [x15, #0x68]\n" + "fmla z30.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x20, x14, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n" + "ldr x20, [x15, #0x70]\n" + "fmla z30.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x23, x14, LSL #1]\n" + "fmla z31.h, p3/M, z3.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x19, x14, LSL #1]\n" + "ldr x19, [x15, #0x78]\n" + "fmla z30.h, p3/M, z0.h, z16.h\n" + "ldr x27, [x15, #0x80]\n" + "fmla z29.h, p3/M, z3.h, z14.h\n" + "ldr x26, [x15, #0x88]\n" + "ldr x25, [x15, #0x90]\n" + "fmla z31.h, p3/M, z4.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x22, x14, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x21, x14, LSL #1]\n" + "fmla z29.h, p3/M, z0.h, z15.h\n" + "ld1h { z14.h }, p2/Z, [x26, x14, LSL #1]\n" + "ldr x23, [x15, #0x98]\n" + "fmla z31.h, p3/M, z2.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n" + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x23, x14, LSL #1]\n" + "ldr x22, [x15, #0xa0]\n" + "fmla z31.h, p3/M, z5.h, z13.h\n" + "ldr x21, [x15, #0xa8]\n" + "fmla z30.h, p3/M, z3.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x19, x14, LSL #1]\n" + "fmla z29.h, p3/M, z1.h, z16.h\n" + "ldr x20, [x15, #0xb0]\n" + "ldr x19, [x15, #0xb8]\n" + "fmla z31.h, p3/M, z6.h, z15.h\n" + "fmla z28.h, p3/M, z4.h, z13.h\n" + "ld1h { z15.h }, p2/Z, [x25, x14, LSL #1]\n" + "fmla z30.h, p3/M, z7.h, z12.h\n" + "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n" + "ldr x24, [x15, #0xc0]\n" + "fmla z31.h, p3/M, z7.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "ldp x27, x26, [x15, #0x0]\n" + "fmla z29.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x19, x14, LSL #1]\n" + "fmla z30.h, p3/M, z8.h, z11.h\n" + "ldp x25, x23, [x15, #0x10]\n" + "ldp x22, x21, [x15, #0x20]\n" + "fmla z28.h, p3/M, z5.h, z14.h\n" + "fmax z31.h, p3/M, z31.h, z19.h\n" + "ld1h { z14.h }, p2/Z, [x20, x14, LSL #1]\n" + "fmla z29.h, p3/M, z7.h, z13.h\n" + "ld1h { z9.h }, p1/Z, [x27, x13, LSL #1]\n" + "fmax z30.h, p3/M, z30.h, z19.h\n" + "ld1h { z10.h }, p1/Z, [x26, x13, LSL #1]\n" + "ld1h { z12.h }, p1/Z, [x23, x13, LSL #1]\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "fmin z31.h, p3/M, z31.h, z18.h\n" + "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n" + "inch x14\n" + "fmla z29.h, p3/M, z5.h, z16.h\n" + "ld1h { z13.h }, p1/Z, [x22, x13, LSL #1]\n" + "whilelt p2.h, x14, %x[n_channels]\n" + "fmin z30.h, p3/M, z30.h, z18.h\n" + "ldp x20, x19, [x15, #0x30]\n" + "ld1h { z17.h }, p3/Z, [x16]\n" // Load from weights and bias + "fmla z28.h, p3/M, z3.h, z16.h\n" + "st1h { z31.h }, p0, [x12, x10, LSL #1]\n" + "mov z31.d, z17.d\n" + "ld1h { z16.h }, p1/Z, [x19, x13, LSL #1]\n" + "fmla z29.h, p3/M, z8.h, z15.h\n" + "st1h { z30.h }, p0, [x11, x10, LSL #1]\n" + "mov z30.d, z17.d\n" + "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z7.h, z14.h\n" + "ld1h { z14.h }, p1/Z, [x21, x13, LSL #1]\n" + "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n" // Load from weights and bias + 
"fmax z29.h, p3/M, z29.h, z19.h\n" + "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p1/Z, [x20, x13, LSL #1]\n" + "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n" // Load from weights and bias + "fmin z29.h, p3/M, z29.h, z18.h\n" + "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n" // Load from weights and bias + "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n" // Load from weights and bias + "fmla z28.h, p3/M, z8.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x25, x13, LSL #1]\n" + "inch x13\n" + "fmax z28.h, p3/M, z28.h, z19.h\n" + "st1h { z29.h }, p0, [x9, x10, LSL #1]\n" + "cmp x13, %x[n_channels]\n" + "mov z29.d, z17.d\n" + "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n" // Load from weights and bias + "addvl x16, x16, #16\n" + "fmin z28.h, p3/M, z28.h, z18.h\n" + "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n" // Load from weights and bias + "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n" // Load from weights and bias + "addvl x16, x16, #-6\n" + "st1h { z28.h }, p0, [x28, x10, LSL #1]\n" + "mov z28.d, z17.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.h, p3/M, z8.h, z9.h\n" + "ldr x24, [x15, #0x40]\n" + "inch x10\n" + "fmla z30.h, p3/M, z6.h, z9.h\n" + "ldr x20, [x15, #0x48]\n" + "mov p0.b, p2.b\n" + "fmla z29.h, p3/M, z2.h, z9.h\n" + "ldr x23, [x15, #0x50]\n" + "fmla z28.h, p3/M, z0.h, z9.h\n" + "ldr x19, [x15, #0x58]\n" + "ldr x22, [x15, #0x60]\n" + "fmla z31.h, p3/M, z0.h, z10.h\n" + "ldr x21, [x15, #0x68]\n" + "fmla z30.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x20, x14, LSL #1]\n" + "fmla z31.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n" + "ldr x20, [x15, #0x70]\n" + "fmla z30.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x23, x14, LSL #1]\n" + "fmla z31.h, p3/M, z3.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x19, x14, LSL #1]\n" + "ldr x19, [x15, #0x78]\n" + "fmla z30.h, p3/M, z0.h, z16.h\n" + "ldr x27, [x15, #0x80]\n" + "fmla z29.h, p3/M, z3.h, z14.h\n" + "ldr x26, [x15, #0x88]\n" + "ldr x25, [x15, #0x90]\n" + "fmla z31.h, p3/M, z4.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x22, x14, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x21, x14, LSL #1]\n" + "fmla z29.h, p3/M, z0.h, z15.h\n" + "ld1h { z14.h }, p2/Z, [x26, x14, LSL #1]\n" + "ldr x23, [x15, #0x98]\n" + "fmla z31.h, p3/M, z2.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x20, x14, LSL #1]\n" + "fmla z30.h, p3/M, z5.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x14, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x23, x14, LSL #1]\n" + "ldr x22, [x15, #0xa0]\n" + "fmla z31.h, p3/M, z5.h, z13.h\n" + "ldr x21, [x15, #0xa8]\n" + "fmla z30.h, p3/M, z3.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x19, x14, LSL #1]\n" + "fmla z29.h, p3/M, z1.h, z16.h\n" + "ldr x20, [x15, #0xb0]\n" + "ldr x19, [x15, #0xb8]\n" + "fmla z31.h, p3/M, z6.h, z15.h\n" + "fmla z28.h, p3/M, z4.h, z13.h\n" + "ld1h { z15.h }, p2/Z, [x25, x14, LSL #1]\n" + "fmla z30.h, p3/M, z7.h, z12.h\n" + "ld1h { z13.h }, p2/Z, [x22, x14, LSL #1]\n" + "ldr x24, [x15, #0xc0]\n" + "fmla z31.h, p3/M, z7.h, z16.h\n" + "ld1h { z16.h }, p2/Z, [x21, x14, LSL #1]\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "fmla z29.h, p3/M, z6.h, z15.h\n" + "ld1h { z15.h }, p2/Z, [x19, x14, LSL #1]\n" + "fmla z30.h, p3/M, z8.h, z11.h\n" + "fmla z28.h, p3/M, z5.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x20, x14, LSL #1]\n" + "fmax z31.h, p3/M, z31.h, z19.h\n" + "fmla z29.h, p3/M, z7.h, z13.h\n" + "fmax z30.h, p3/M, z30.h, z19.h\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x24, 
x14, LSL #1]\n"
+ "fmin z31.h, p3/M, z31.h, z18.h\n"
+ "st1h { z31.h }, p0, [x12, x10, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z16.h\n"
+ "fmla z28.h, p3/M, z3.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z18.h\n"
+ "st1h { z30.h }, p0, [x11, x10, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z14.h\n"
+ "fmla z29.h, p3/M, z8.h, z15.h\n"
+ "fmla z28.h, p3/M, z6.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z19.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmin z29.h, p3/M, z29.h, z18.h\n"
+ "st1h { z29.h }, p0, [x9, x10, LSL #1]\n"
+ "fmax z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z28.h, p3/M, z28.h, z18.h\n"
+ "st1h { z28.h }, p0, [x28, x10, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..20f3ee0329
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef __fp16 bias_type;
+  typedef __fp16 input_type;
+  typedef __fp16 weight_type;
+  typedef __fp16 return_type;
+
+  typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
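For reference, the geometry constants in these kernel descriptors are linked: one output tile consumes stride * (output - 1) + kernel input points per dimension, which is why the 5x5, stride-1, 2x2-output descriptor above declares a 6x6 input patch. A minimal standalone C++ sketch of that relation (illustrative only; input_extent is a hypothetical helper, not part of this patch):

#include <cstdio>

// Input extent one output tile consumes per dimension (hypothetical helper).
static unsigned int input_extent(unsigned int stride, unsigned int output, unsigned int kernel)
{
    return stride * (output - 1) + kernel;
}

int main()
{
    // 5x5 kernel, stride 1, 2x2 output tile -> 6x6 input patch, as declared above.
    std::printf("input extent = %u\n", input_extent(1, 2, 5));
    return 0;
}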
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..f1ee5c53ce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x5, #0x0\n"
+    "mov x6, #0x0\n"
+    "1:" // Tile loop
+    "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x20, #0x2\n"
+    "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x7, #0x2\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x17, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cnth x16\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x14, XZR, x16\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x5, x22\n" // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col
+    "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x20\n" // offset *= kernel_stride * output_size
+    "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x13, x13, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x20, x13, x22, LSL #1\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x10, x20, x22, LSL #1\n"
+    "ld1h { z16.h }, p3/Z, [x8]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias
+    "add x9, x10, x22, LSL #1\n"
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias
+    "add x28, x9, x22, LSL #1\n"
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias
+    "add x27, x28, x22, LSL #1\n"
+    "mov z28.d, 
z16.d\n" + "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias + "add x26, x15, x15\n" + "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias + "add x25, x26, x15\n" + "mul x19, x5, x21\n" // offset = tile_i * ld_output_row + "add x24, x25, x15\n" + "add x23, x24, x15\n" + "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x7\n" // offset *= output_tile_size + "add x11, x11, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16) + "add x22, x11, x21, LSL #1\n" + "whilelt p2.h, XZR, %x[n_channels]\n" + "ld1h { z5.h }, p2/Z, [x13]\n" // Load input point (0, 0) + "ld1h { z6.h }, p2/Z, [x13, x15, LSL #1]\n" // Load input point (0, 1) + "cmp x16, %x[n_channels]\n" + "ld1h { z7.h }, p2/Z, [x20]\n" // Load input point (1, 0) + "addvl x8, x8, #6\n" + "ld1h { z8.h }, p2/Z, [x20, x15, LSL #1]\n" // Load input point (1, 1) + "ld1h { z9.h }, p2/Z, [x13, x26, LSL #1]\n" // Load input point (0, 2) + "ld1h { z13.h }, p2/Z, [x20, x26, LSL #1]\n" // Load input point (1, 2) + "ld1h { z11.h }, p2/Z, [x13, x25, LSL #1]\n" // Load input point (0, 3) + "ld1h { z12.h }, p2/Z, [x13, x24, LSL #1]\n" // Load input point (0, 4) + "ld1h { z10.h }, p2/Z, [x20, x23, LSL #1]\n" // Load input point (1, 5) + "ld1h { z14.h }, p2/Z, [x10]\n" // Load input point (2, 0) + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n" // Load input point (1, 3) + "whilelt p1.h, x16, %x[n_channels]\n" + "fmla z30.h, p3/M, z0.h, z6.h\n" + "inch x14\n" + "fmla z29.h, p3/M, z0.h, z7.h\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z0.h, z8.h\n" + "ld1h { z0.h }, p3/Z, [x8]\n" // Load from weights and bias + "inch x17\n" + "fmla z31.h, p3/M, z1.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n" // Load input point (1, 4) + "addvl x20, x20, #1\n" + "fmla z30.h, p3/M, z1.h, z9.h\n" + "inch x16\n" + "fmla z29.h, p3/M, z1.h, z8.h\n" + "fmla z28.h, p3/M, z1.h, z13.h\n" + "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n" // Load input point (0, 5) + "addvl x13, x13, #1\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "fmla z29.h, p3/M, z2.h, z13.h\n" + "fmla z28.h, p3/M, z2.h, z5.h\n" + "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1) + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z29.h, p3/M, z3.h, z5.h\n" + "fmla z28.h, p3/M, z3.h, z6.h\n" + "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 2) + "fmla z30.h, p3/M, z4.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 3) + "fmla z29.h, p3/M, z4.h, z6.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z7.h\n" + "ld1h { z7.h }, p1/Z, [x20]\n" // Load input point (1, 0) + "fmla z30.h, p3/M, z0.h, z8.h\n" + "fmla z29.h, p3/M, z0.h, z14.h\n" + "fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (2, 5) + "fmla z30.h, p3/M, z1.h, z13.h\n" + "fmla z29.h, p3/M, z1.h, z11.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "ld1h { z1.h }, p3/Z, [x8, 
#6, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (2, 4) + "addvl x10, x10, #1\n" + "fmla z30.h, p3/M, z2.h, z5.h\n" + "fmla z29.h, p3/M, z2.h, z12.h\n" + "fmla z28.h, p3/M, z2.h, z9.h\n" + "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias + "addvl x8, x8, #16\n" + "fmla z31.h, p3/M, z3.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x9]\n" // Load input point (3, 0) + "ld1h { z16.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias + "fmla z30.h, p3/M, z3.h, z6.h\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1) + "fmla z30.h, p3/M, z4.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 2) + "fmla z29.h, p3/M, z4.h, z13.h\n" + "fmla z28.h, p3/M, z4.h, z8.h\n" + "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (3, 5) + "fmla z30.h, p3/M, z0.h, z11.h\n" + "fmla z29.h, p3/M, z0.h, z5.h\n" + "fmla z28.h, p3/M, z0.h, z6.h\n" + "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 3) + "fmla z30.h, p3/M, z1.h, z12.h\n" + "fmla z29.h, p3/M, z1.h, z6.h\n" + "fmla z28.h, p3/M, z1.h, z10.h\n" + "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (3, 4) + "addvl x9, x9, #1\n" + "fmla z30.h, p3/M, z2.h, z9.h\n" + "fmla z29.h, p3/M, z2.h, z10.h\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x28]\n" // Load input point (4, 0) + "fmla z30.h, p3/M, z3.h, z13.h\n" + "fmla z29.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1) + "fmla z30.h, p3/M, z4.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (4, 4) + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z4.h, z14.h\n" + "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 2) + "fmla z30.h, p3/M, z0.h, z6.h\n" + "fmla z29.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z0.h, z13.h\n" + "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 3) + "fmla z30.h, p3/M, z1.h, z10.h\n" + "fmla z29.h, p3/M, z1.h, z13.h\n" + "fmla z28.h, p3/M, z1.h, z5.h\n" + "ld1h { z1.h }, p3/Z, [x8]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (4, 5) + "addvl x28, x28, #1\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "fmla z29.h, p3/M, z2.h, z5.h\n" + "fmla z28.h, p3/M, z2.h, z6.h\n" + "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z11.h\n" + 
"ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (5, 0) + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z29.h, p3/M, z3.h, z6.h\n" + "fmla z28.h, p3/M, z3.h, z8.h\n" + "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n" // Load input point (5, 1) + "fmla z30.h, p3/M, z4.h, z14.h\n" + "ld1h { z14.h }, p1/Z, [x10]\n" // Load input point (2, 0) + "fmla z29.h, p3/M, z4.h, z8.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n" // Load input point (5, 2) + "fmla z30.h, p3/M, z0.h, z13.h\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (5, 3) + "fmla z28.h, p3/M, z0.h, z12.h\n" + "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z13.h\n" + "ld1h { z13.h }, p1/Z, [x20, x26, LSL #1]\n" // Load input point (1, 2) + "fmla z30.h, p3/M, z1.h, z5.h\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (5, 4) + "fmla z28.h, p3/M, z1.h, z9.h\n" + "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z5.h\n" + "ld1h { z5.h }, p1/Z, [x13]\n" // Load input point (0, 0) + "fmla z30.h, p3/M, z2.h, z6.h\n" + "fmla z29.h, p3/M, z2.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (5, 5) + "whilelt p2.h, x17, %x[n_channels]\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias + "addvl x27, x27, #1\n" + "fmla z31.h, p3/M, z3.h, z6.h\n" + "ld1h { z6.h }, p1/Z, [x13, x15, LSL #1]\n" // Load input point (0, 1) + "addvl x8, x8, #16\n" + "fmla z30.h, p3/M, z3.h, z8.h\n" + "cmp x16, %x[n_channels]\n" + "fmla z29.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p1/Z, [x13, x25, LSL #1]\n" // Load input point (0, 3) + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z8.h\n" + "ld1h { z8.h }, p1/Z, [x20, x15, LSL #1]\n" // Load input point (1, 1) + "fmla z30.h, p3/M, z4.h, z10.h\n" + "ld1h { z10.h }, p1/Z, [x20, x23, LSL #1]\n" // Load input point (1, 5) + "fmla z29.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x13, x24, LSL #1]\n" // Load input point (0, 4) + "fmla z28.h, p3/M, z4.h, z9.h\n" + "ld1h { z9.h }, p1/Z, [x13, x26, LSL #1]\n" // Load input point (0, 2) + "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias + "fmax z31.h, p3/M, z31.h, z18.h\n" + "addvl x8, x8, #-6\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "fmax z29.h, p3/M, z29.h, z18.h\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "st1h { z31.h }, p0, [x11]\n" // Store output point (0, 0) + "mov z31.d, z16.d\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "st1h { z30.h }, p0, [x11, x12, LSL #1]\n" // Store output point (0, 1) + "mov z30.d, z16.d\n" + "addvl x11, x11, #1\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0) + "mov z29.d, z16.d\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z28.h }, p0, [x22, x12, LSL #1]\n" // Store output point (1, 1) + "mov z28.d, z16.d\n" + "addvl x22, x22, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x20, x25, LSL #1]\n" // Load input point (1, 3) + "mov 
p0.b, p2.b\n" + "fmla z30.h, p3/M, z0.h, z6.h\n" + "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x5, #0x1\n" + "fmla z29.h, p3/M, z0.h, z7.h\n" + "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "fmla z28.h, p3/M, z0.h, z8.h\n" + "ld1h { z0.h }, p3/Z, [x8]\n" // Load from weights and bias + "add x6, x6, #0x1\n" + "fmla z31.h, p3/M, z1.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x20, x24, LSL #1]\n" // Load input point (1, 4) + "fmla z30.h, p3/M, z1.h, z9.h\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z29.h, p3/M, z1.h, z8.h\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x6, x19\n" + "fmla z28.h, p3/M, z1.h, z13.h\n" + "ld1h { z1.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x13, x23, LSL #1]\n" // Load input point (0, 5) + "csel x6, x6, XZR, LT\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "csel x5, x5, x21, LT\n" + "fmla z29.h, p3/M, z2.h, z13.h\n" + "cmp x5, x20\n" + "fmla z28.h, p3/M, z2.h, z5.h\n" + "ld1h { z2.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x10, x15, LSL #1]\n" // Load input point (2, 1) + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z29.h, p3/M, z3.h, z5.h\n" + "fmla z28.h, p3/M, z3.h, z6.h\n" + "ld1h { z3.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x10, x26, LSL #1]\n" // Load input point (2, 2) + "fmla z30.h, p3/M, z4.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x10, x25, LSL #1]\n" // Load input point (2, 3) + "fmla z29.h, p3/M, z4.h, z6.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x8, #4, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z7.h\n" + "fmla z30.h, p3/M, z0.h, z8.h\n" + "fmla z29.h, p3/M, z0.h, z14.h\n" + "fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x8, #5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x10, x23, LSL #1]\n" // Load input point (2, 5) + "fmla z30.h, p3/M, z1.h, z13.h\n" + "fmla z29.h, p3/M, z1.h, z11.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "ld1h { z1.h }, p3/Z, [x8, #6, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x10, x24, LSL #1]\n" // Load input point (2, 4) + "fmla z30.h, p3/M, z2.h, z5.h\n" + "fmla z29.h, p3/M, z2.h, z12.h\n" + "fmla z28.h, p3/M, z2.h, z9.h\n" + "ld1h { z2.h }, p3/Z, [x8, #7, MUL VL]\n" // Load from weights and bias + "addvl x8, x8, #16\n" + "fmla z31.h, p3/M, z3.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x9]\n" // Load input point (3, 0) + "fmla z30.h, p3/M, z3.h, z6.h\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "ld1h { z3.h }, p3/Z, [x8, #-8, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x9, x15, LSL #1]\n" // Load input point (3, 1) + "fmla z30.h, p3/M, z4.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x9, x26, LSL #1]\n" // Load input point (3, 2) + "fmla z29.h, p3/M, z4.h, z13.h\n" + "fmla z28.h, p3/M, z4.h, z8.h\n" + "ld1h { z4.h }, p3/Z, [x8, #-7, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x9, x23, LSL #1]\n" // Load input point (3, 5) + "fmla z30.h, p3/M, z0.h, z11.h\n" + "fmla z29.h, p3/M, z0.h, z5.h\n" + "fmla z28.h, p3/M, z0.h, z6.h\n" + "ld1h { z0.h }, p3/Z, [x8, #-6, MUL VL]\n" // Load from weights and bias + "fmla 
z31.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x9, x25, LSL #1]\n" // Load input point (3, 3) + "fmla z30.h, p3/M, z1.h, z12.h\n" + "fmla z29.h, p3/M, z1.h, z6.h\n" + "fmla z28.h, p3/M, z1.h, z10.h\n" + "ld1h { z1.h }, p3/Z, [x8, #-5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x9, x24, LSL #1]\n" // Load input point (3, 4) + "fmla z30.h, p3/M, z2.h, z9.h\n" + "fmla z29.h, p3/M, z2.h, z10.h\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z2.h }, p3/Z, [x8, #-4, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x28]\n" // Load input point (4, 0) + "fmla z30.h, p3/M, z3.h, z13.h\n" + "fmla z29.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z3.h }, p3/Z, [x8, #-3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x28, x15, LSL #1]\n" // Load input point (4, 1) + "fmla z30.h, p3/M, z4.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x28, x24, LSL #1]\n" // Load input point (4, 4) + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z4.h, z14.h\n" + "ld1h { z4.h }, p3/Z, [x8, #-2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x28, x26, LSL #1]\n" // Load input point (4, 2) + "fmla z30.h, p3/M, z0.h, z6.h\n" + "fmla z29.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z0.h, z13.h\n" + "ld1h { z0.h }, p3/Z, [x8, #-1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x28, x25, LSL #1]\n" // Load input point (4, 3) + "fmla z30.h, p3/M, z1.h, z10.h\n" + "fmla z29.h, p3/M, z1.h, z13.h\n" + "fmla z28.h, p3/M, z1.h, z5.h\n" + "ld1h { z1.h }, p3/Z, [x8]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x28, x23, LSL #1]\n" // Load input point (4, 5) + "fmla z30.h, p3/M, z2.h, z11.h\n" + "fmla z29.h, p3/M, z2.h, z5.h\n" + "fmla z28.h, p3/M, z2.h, z6.h\n" + "ld1h { z2.h }, p3/Z, [x8, #1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27]\n" // Load input point (5, 0) + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z29.h, p3/M, z3.h, z6.h\n" + "fmla z28.h, p3/M, z3.h, z8.h\n" + "ld1h { z3.h }, p3/Z, [x8, #2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n" // Load input point (5, 1) + "fmla z30.h, p3/M, z4.h, z14.h\n" + "fmla z29.h, p3/M, z4.h, z8.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x8, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x27, x26, LSL #1]\n" // Load input point (5, 2) + "fmla z30.h, p3/M, z0.h, z13.h\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x27, x25, LSL #1]\n" // Load input point (5, 3) + "fmla z28.h, p3/M, z0.h, z12.h\n" + "fmla z31.h, p3/M, z1.h, z13.h\n" + "fmla z30.h, p3/M, z1.h, z5.h\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x24, LSL #1]\n" // Load input point (5, 4) + "fmla z28.h, p3/M, z1.h, z9.h\n" + "fmla z31.h, p3/M, z2.h, z5.h\n" + "fmla z30.h, p3/M, z2.h, z6.h\n" + "fmla z29.h, p3/M, z2.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x27, x23, LSL #1]\n" // Load input point (5, 5) + "fmla z28.h, p3/M, z2.h, z11.h\n" + "fmla z31.h, p3/M, z3.h, z6.h\n" + "fmla z30.h, p3/M, z3.h, z8.h\n" + "fmla z29.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "fmla z31.h, p3/M, z4.h, z8.h\n" + "fmla z30.h, 
p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z9.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x11]\n" // Store output point (0, 0)
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x11, x12, LSL #1]\n" // Store output point (0, 1)
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z29.h }, p0, [x22]\n" // Store output point (1, 0)
+ "st1h { z28.h }, p0, [x22, x12, LSL #1]\n" // Store output point (1, 1)
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
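The comments in the tile loop above spell out the direct kernel's addressing: offset = tile_i * ld_input_row, offset += tile_j * ld_input_col, offset *= 2 (the output tile extent times the unit stride), and the byte pointer then advances by offset * sizeof(__fp16). A hedged C++ restatement in element units (tile_input_origin is a hypothetical name, not part of this patch):

#include <cstdint>

// Sketch of the input-tile origin computation, mirroring the asm comments.
static const __fp16 *tile_input_origin(const __fp16 *inptr, uint64_t tile_i, uint64_t tile_j,
                                       uint64_t ld_input_row, uint64_t ld_input_col)
{
    const uint64_t offset = (tile_i * ld_input_row + tile_j * ld_input_col) * 2;
    return inptr + offset; // the asm applies the sizeof(__fp16) scaling with LSL #1
}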
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..caa15a9816
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[36];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[0];
+      inptrs[1] = input_ptrs[1];
+      inptrs[2] = input_ptrs[6];
+      inptrs[3] = input_ptrs[7];
+      inptrs[4] = input_ptrs[2];
+      inptrs[5] = input_ptrs[8];
+      inptrs[6] = input_ptrs[3];
+      inptrs[7] = input_ptrs[4];
+      inptrs[8] = input_ptrs[11];
+      inptrs[9] = input_ptrs[12];
+      inptrs[10] = input_ptrs[9];
+      inptrs[11] = input_ptrs[10];
+      inptrs[12] = input_ptrs[5];
+      inptrs[13] = input_ptrs[13];
+      inptrs[14] = input_ptrs[14];
+      inptrs[15] = input_ptrs[15];
+      inptrs[16] = input_ptrs[16];
+      inptrs[17] = input_ptrs[17];
+      inptrs[18] = input_ptrs[18];
+      inptrs[19] = input_ptrs[19];
+      inptrs[20] = input_ptrs[20];
+      inptrs[21] = input_ptrs[21];
+      inptrs[22] = input_ptrs[22];
+      inptrs[23] = input_ptrs[23];
+      inptrs[24] = input_ptrs[24];
+      inptrs[25] = input_ptrs[25];
+      inptrs[26] = input_ptrs[26];
+      inptrs[27] = input_ptrs[27];
+      inptrs[28] = input_ptrs[28];
+      inptrs[29] = input_ptrs[29];
+      inptrs[30] = input_ptrs[30];
+      inptrs[31] = input_ptrs[31];
+      inptrs[32] = input_ptrs[32];
+      inptrs[33] = input_ptrs[33];
+      inptrs[34] = input_ptrs[34];
+      inptrs[35] = input_ptrs[35];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x6, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "mov x7, #0x0\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "cnth x8\n"
+    "ldp x17, x16, [x19, #0x0]\n"
+    "sub x15, XZR, x8\n"
+    "ldp x14, x13, [x19, #0x10]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z16.h }, p3/Z, [x5]\n" // Load from weights and bias
+    "mov z31.d, z16.d\n"
+    "ld1h { z0.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+    "cmp x8, %x[n_channels]\n"
+    "mov z30.d, z16.d\n"
+    "ld1h { z1.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+    "mov z29.d, z16.d\n"
+    "ld1h { z2.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+    "mov z28.d, z16.d\n"
+    "ld1h { z3.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias
+    "ld1h { z4.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias
+    "addvl x5, x5, #6\n"
+    "ldp x12, x11, [x6, #0x0]\n"
+    "ldp x10, x9, [x6, #0x10]\n"
+    "ldp x20, x28, [x6, #0x20]\n"
+    "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "ld1h { z6.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "ld1h { z7.h }, p2/Z, [x10, x7, LSL #1]\n"
+    "ld1h { z8.h }, p2/Z, [x9, 
x7, LSL #1]\n" + "ld1h { z9.h }, p2/Z, [x20, x7, LSL #1]\n" + "ld1h { z13.h }, p2/Z, [x28, x7, LSL #1]\n" + "ldp x27, x19, [x6, #0x30]\n" + "ldp x26, x25, [x6, #0x40]\n" + "ld1h { z11.h }, p2/Z, [x27, x7, LSL #1]\n" + "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n" + "ld1h { z10.h }, p2/Z, [x26, x7, LSL #1]\n" + "ld1h { z14.h }, p2/Z, [x25, x7, LSL #1]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ldr x24, [x6, #0x50]\n" + "whilelt p1.h, x8, %x[n_channels]\n" + "fmla z30.h, p3/M, z0.h, z6.h\n" + "ldr x23, [x6, #0x58]\n" + "inch x15\n" + "fmla z29.h, p3/M, z0.h, z7.h\n" + "ldr x22, [x6, #0x60]\n" + "mov p0.b, p2.b\n" + "fmla z28.h, p3/M, z0.h, z8.h\n" + "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n" + "ld1h { z0.h }, p3/Z, [x5]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n" + "fmla z30.h, p3/M, z1.h, z9.h\n" + "ldr x21, [x6, #0x68]\n" + "fmla z29.h, p3/M, z1.h, z8.h\n" + "ldr x20, [x6, #0x70]\n" + "fmla z28.h, p3/M, z1.h, z13.h\n" + "ld1h { z1.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x22, x7, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "ldr x19, [x6, #0x78]\n" + "fmla z29.h, p3/M, z2.h, z13.h\n" + "ldr x12, [x6, #0x80]\n" + "fmla z28.h, p3/M, z2.h, z5.h\n" + "ld1h { z2.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x21, x7, LSL #1]\n" + "fmla z30.h, p3/M, z3.h, z12.h\n" + "ldr x11, [x6, #0x88]\n" + "fmla z29.h, p3/M, z3.h, z5.h\n" + "ldr x10, [x6, #0x90]\n" + "fmla z28.h, p3/M, z3.h, z6.h\n" + "ld1h { z3.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x20, x7, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x19, x7, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z6.h\n" + "ldr x9, [x6, #0x98]\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z7.h\n" + "ldr x20, [x6, #0xa0]\n" + "fmla z30.h, p3/M, z0.h, z8.h\n" + "ldr x28, [x6, #0xa8]\n" + "fmla z29.h, p3/M, z0.h, z14.h\n" + "ldr x27, [x6, #0xb0]\n" + "fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x11, x7, LSL #1]\n" + "fmla z30.h, p3/M, z1.h, z13.h\n" + "ldr x19, [x6, #0xb8]\n" + "fmla z29.h, p3/M, z1.h, z11.h\n" + "ldr x26, [x6, #0xc0]\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x12, x7, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z5.h\n" + "ldr x25, [x6, #0xc8]\n" + "fmla z29.h, p3/M, z2.h, z12.h\n" + "ldr x24, [x6, #0xd0]\n" + "fmla z28.h, p3/M, z2.h, z9.h\n" + "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z5.h\n" + "addvl x5, x5, #16\n" + "fmla z30.h, p3/M, z3.h, z6.h\n" + "ld1h { z5.h }, p2/Z, [x10, x7, LSL #1]\n" + "ldr x23, [x6, #0xd8]\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "ldr x22, [x6, #0xe0]\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x9, x7, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x20, x7, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z13.h\n" + "ldr x21, [x6, 
#0xe8]\n" + "fmla z28.h, p3/M, z4.h, z8.h\n" + "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x19, x7, LSL #1]\n" + "fmla z30.h, p3/M, z0.h, z11.h\n" + "ldr x20, [x6, #0xf0]\n" + "fmla z29.h, p3/M, z0.h, z5.h\n" + "ldr x19, [x6, #0xf8]\n" + "fmla z28.h, p3/M, z0.h, z6.h\n" + "ld1h { z0.h }, p3/Z, [x5, #-6, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n" + "fmla z30.h, p3/M, z1.h, z12.h\n" + "ldr x12, [x6, #0x100]\n" + "fmla z29.h, p3/M, z1.h, z6.h\n" + "ldr x11, [x6, #0x108]\n" + "fmla z28.h, p3/M, z1.h, z10.h\n" + "ld1h { z1.h }, p3/Z, [x5, #-5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x7, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z9.h\n" + "ldr x10, [x6, #0x110]\n" + "fmla z29.h, p3/M, z2.h, z10.h\n" + "ldr x9, [x6, #0x118]\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z2.h }, p3/Z, [x5, #-4, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x26, x7, LSL #1]\n" + "fmla z30.h, p3/M, z3.h, z13.h\n" + "ld1h { z16.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias + "fmla z29.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z3.h }, p3/Z, [x5, #-3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x25, x7, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x22, x7, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z4.h, z14.h\n" + "ld1h { z4.h }, p3/Z, [x5, #-2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n" + "fmla z30.h, p3/M, z0.h, z6.h\n" + "fmla z29.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z0.h, z13.h\n" + "ld1h { z0.h }, p3/Z, [x5, #-1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n" + "fmla z30.h, p3/M, z1.h, z10.h\n" + "fmla z29.h, p3/M, z1.h, z13.h\n" + "fmla z28.h, p3/M, z1.h, z5.h\n" + "ld1h { z1.h }, p3/Z, [x5]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x21, x7, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "fmla z29.h, p3/M, z2.h, z5.h\n" + "fmla z28.h, p3/M, z2.h, z6.h\n" + "ld1h { z2.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x20, x7, LSL #1]\n" + "fmla z30.h, p3/M, z3.h, z12.h\n" + "fmla z29.h, p3/M, z3.h, z6.h\n" + "fmla z28.h, p3/M, z3.h, z8.h\n" + "ld1h { z3.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z14.h\n" + "fmla z29.h, p3/M, z4.h, z8.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n" + "fmla z30.h, p3/M, z0.h, z13.h\n" + "fmla z29.h, p3/M, z0.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n" + "ldp x12, x11, [x6, #0x0]\n" + "fmla z28.h, p3/M, z0.h, z12.h\n" + "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z13.h\n" + "fmla z30.h, p3/M, z1.h, z5.h\n" + "fmla z29.h, p3/M, z1.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x10, x7, LSL #1]\n" + "fmla z28.h, p3/M, z1.h, z9.h\n" + "ld1h 
{ z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z2.h, z5.h\n" + "ld1h { z5.h }, p1/Z, [x12, x8, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z6.h\n" + "fmla z29.h, p3/M, z2.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x9, x7, LSL #1]\n" + "inch x7\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ldp x10, x9, [x6, #0x10]\n" + "whilelt p2.h, x7, %x[n_channels]\n" + "fmla z31.h, p3/M, z3.h, z6.h\n" + "ld1h { z6.h }, p1/Z, [x11, x8, LSL #1]\n" + "ldp x20, x28, [x6, #0x20]\n" + "fmla z30.h, p3/M, z3.h, z8.h\n" + "ldp x27, x19, [x6, #0x30]\n" + "fmla z29.h, p3/M, z3.h, z11.h\n" + "ld1h { z7.h }, p1/Z, [x10, x8, LSL #1]\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z13.h }, p1/Z, [x28, x8, LSL #1]\n" + "fmla z31.h, p3/M, z4.h, z8.h\n" + "ld1h { z8.h }, p1/Z, [x9, x8, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z10.h\n" + "ld1h { z11.h }, p1/Z, [x27, x8, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p1/Z, [x19, x8, LSL #1]\n" + "fmla z28.h, p3/M, z4.h, z9.h\n" + "ld1h { z9.h }, p1/Z, [x20, x8, LSL #1]\n" + "fmax z31.h, p3/M, z31.h, z18.h\n" + "ldp x26, x25, [x6, #0x40]\n" + "fmax z30.h, p3/M, z30.h, z18.h\n" + "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias + "fmax z29.h, p3/M, z29.h, z18.h\n" + "addvl x5, x5, #16\n" + "fmax z28.h, p3/M, z28.h, z18.h\n" + "ld1h { z10.h }, p1/Z, [x26, x8, LSL #1]\n" + "ld1h { z14.h }, p1/Z, [x25, x8, LSL #1]\n" + "fmin z31.h, p3/M, z31.h, z17.h\n" + "inch x8\n" + "fmin z30.h, p3/M, z30.h, z17.h\n" + "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias + "cmp x8, %x[n_channels]\n" + "fmin z29.h, p3/M, z29.h, z17.h\n" + "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias + "addvl x5, x5, #-6\n" + "fmin z28.h, p3/M, z28.h, z17.h\n" + "st1h { z31.h }, p0, [x17, x15, LSL #1]\n" + "mov z31.d, z16.d\n" + "st1h { z30.h }, p0, [x16, x15, LSL #1]\n" + "mov z30.d, z16.d\n" + "st1h { z29.h }, p0, [x14, x15, LSL #1]\n" + "mov z29.d, z16.d\n" + "st1h { z28.h }, p0, [x13, x15, LSL #1]\n" + "mov z28.d, z16.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ldr x24, [x6, #0x50]\n" + "inch x15\n" + "fmla z30.h, p3/M, z0.h, z6.h\n" + "ldr x23, [x6, #0x58]\n" + "mov p0.b, p2.b\n" + "fmla z29.h, p3/M, z0.h, z7.h\n" + "ldr x22, [x6, #0x60]\n" + "fmla z28.h, p3/M, z0.h, z8.h\n" + "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n" + "ld1h { z0.h }, p3/Z, [x5]\n" // Load from weights and bias + "fmla z31.h, p3/M, z1.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n" + "fmla z30.h, p3/M, z1.h, z9.h\n" + "ldr x21, [x6, #0x68]\n" + "fmla z29.h, p3/M, z1.h, z8.h\n" + "fmla z28.h, p3/M, z1.h, z13.h\n" + "ld1h { z1.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias + "ldr x20, [x6, #0x70]\n" + "fmla z31.h, p3/M, z2.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x22, x7, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z11.h\n" + "ldr x19, [x6, #0x78]\n" + "fmla z29.h, p3/M, z2.h, z13.h\n" + "fmla z28.h, p3/M, z2.h, z5.h\n" + "ld1h { z2.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias + "ldr x12, [x6, #0x80]\n" + "fmla z31.h, p3/M, z3.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x21, x7, LSL #1]\n" + "fmla z30.h, p3/M, z3.h, z12.h\n" + "ldr x11, [x6, #0x88]\n" + "fmla z29.h, p3/M, z3.h, z5.h\n" + "fmla z28.h, p3/M, z3.h, z6.h\n" + "ld1h { z3.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias + "ldr x10, [x6, #0x90]\n" + "fmla z31.h, p3/M, z4.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x20, x7, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x19, x7, LSL 
#1]\n" + "fmla z29.h, p3/M, z4.h, z6.h\n" + "fmla z28.h, p3/M, z4.h, z10.h\n" + "ld1h { z4.h }, p3/Z, [x5, #4, MUL VL]\n" // Load from weights and bias + "ldr x9, [x6, #0x98]\n" + "fmla z31.h, p3/M, z0.h, z7.h\n" + "ldr x20, [x6, #0xa0]\n" + "fmla z30.h, p3/M, z0.h, z8.h\n" + "ldr x28, [x6, #0xa8]\n" + "fmla z29.h, p3/M, z0.h, z14.h\n" + "fmla z28.h, p3/M, z0.h, z11.h\n" + "ld1h { z0.h }, p3/Z, [x5, #5, MUL VL]\n" // Load from weights and bias + "ldr x27, [x6, #0xb0]\n" + "fmla z31.h, p3/M, z1.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x11, x7, LSL #1]\n" + "fmla z30.h, p3/M, z1.h, z13.h\n" + "ldr x19, [x6, #0xb8]\n" + "fmla z29.h, p3/M, z1.h, z11.h\n" + "fmla z28.h, p3/M, z1.h, z12.h\n" + "ld1h { z1.h }, p3/Z, [x5, #6, MUL VL]\n" // Load from weights and bias + "ldr x26, [x6, #0xc0]\n" + "fmla z31.h, p3/M, z2.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x12, x7, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z5.h\n" + "ldr x25, [x6, #0xc8]\n" + "fmla z29.h, p3/M, z2.h, z12.h\n" + "fmla z28.h, p3/M, z2.h, z9.h\n" + "ld1h { z2.h }, p3/Z, [x5, #7, MUL VL]\n" // Load from weights and bias + "addvl x5, x5, #16\n" + "fmla z31.h, p3/M, z3.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x10, x7, LSL #1]\n" + "ldr x24, [x6, #0xd0]\n" + "fmla z30.h, p3/M, z3.h, z6.h\n" + "ldr x23, [x6, #0xd8]\n" + "fmla z29.h, p3/M, z3.h, z9.h\n" + "fmla z28.h, p3/M, z3.h, z13.h\n" + "ld1h { z3.h }, p3/Z, [x5, #-8, MUL VL]\n" // Load from weights and bias + "ldr x22, [x6, #0xe0]\n" + "fmla z31.h, p3/M, z4.h, z6.h\n" + "ld1h { z6.h }, p2/Z, [x9, x7, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z10.h\n" + "ld1h { z10.h }, p2/Z, [x20, x7, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z13.h\n" + "fmla z28.h, p3/M, z4.h, z8.h\n" + "ld1h { z4.h }, p3/Z, [x5, #-7, MUL VL]\n" // Load from weights and bias + "ldr x21, [x6, #0xe8]\n" + "fmla z31.h, p3/M, z0.h, z14.h\n" + "ld1h { z14.h }, p2/Z, [x19, x7, LSL #1]\n" + "fmla z30.h, p3/M, z0.h, z11.h\n" + "ldr x20, [x6, #0xf0]\n" + "fmla z29.h, p3/M, z0.h, z5.h\n" + "fmla z28.h, p3/M, z0.h, z6.h\n" + "ld1h { z0.h }, p3/Z, [x5, #-6, MUL VL]\n" // Load from weights and bias + "ldr x19, [x6, #0xf8]\n" + "fmla z31.h, p3/M, z1.h, z11.h\n" + "ld1h { z11.h }, p2/Z, [x28, x7, LSL #1]\n" + "fmla z30.h, p3/M, z1.h, z12.h\n" + "ldr x12, [x6, #0x100]\n" + "fmla z29.h, p3/M, z1.h, z6.h\n" + "fmla z28.h, p3/M, z1.h, z10.h\n" + "ld1h { z1.h }, p3/Z, [x5, #-5, MUL VL]\n" // Load from weights and bias + "ldr x11, [x6, #0x108]\n" + "fmla z31.h, p3/M, z2.h, z12.h\n" + "ld1h { z12.h }, p2/Z, [x27, x7, LSL #1]\n" + "fmla z30.h, p3/M, z2.h, z9.h\n" + "ldr x10, [x6, #0x110]\n" + "fmla z29.h, p3/M, z2.h, z10.h\n" + "fmla z28.h, p3/M, z2.h, z11.h\n" + "ld1h { z2.h }, p3/Z, [x5, #-4, MUL VL]\n" // Load from weights and bias + "ldr x9, [x6, #0x118]\n" + "fmla z31.h, p3/M, z3.h, z9.h\n" + "ld1h { z9.h }, p2/Z, [x26, x7, LSL #1]\n" + "fmla z30.h, p3/M, z3.h, z13.h\n" + "fmla z29.h, p3/M, z3.h, z11.h\n" + "fmla z28.h, p3/M, z3.h, z12.h\n" + "ld1h { z3.h }, p3/Z, [x5, #-3, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z4.h, z13.h\n" + "ld1h { z13.h }, p2/Z, [x25, x7, LSL #1]\n" + "fmla z30.h, p3/M, z4.h, z8.h\n" + "ld1h { z8.h }, p2/Z, [x22, x7, LSL #1]\n" + "fmla z29.h, p3/M, z4.h, z12.h\n" + "fmla z28.h, p3/M, z4.h, z14.h\n" + "ld1h { z4.h }, p3/Z, [x5, #-2, MUL VL]\n" // Load from weights and bias + "fmla z31.h, p3/M, z0.h, z5.h\n" + "ld1h { z5.h }, p2/Z, [x24, x7, LSL #1]\n" + "fmla z30.h, p3/M, z0.h, z6.h\n" + "fmla z29.h, p3/M, z0.h, z9.h\n" + "fmla z28.h, p3/M, z0.h, z13.h\n" + "ld1h { z0.h }, p3/Z, [x5, #-1, MUL VL]\n" // Load from 
weights and bias
+ "fmla z31.h, p3/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p2/Z, [x23, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z1.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p3/Z, [x5]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z2.h, z5.h\n"
+ "fmla z28.h, p3/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p3/Z, [x5, #1, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x20, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "fmla z28.h, p3/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p3/Z, [x5, #2, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x19, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z14.h\n"
+ "fmla z29.h, p3/M, z4.h, z8.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p3/Z, [x5, #3, MUL VL]\n" // Load from weights and bias
+ "fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z0.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z13.h\n"
+ "fmla z30.h, p3/M, z1.h, z5.h\n"
+ "fmla z29.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x10, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z5.h\n"
+ "fmla z30.h, p3/M, z2.h, z6.h\n"
+ "fmla z29.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x9, x7, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z6.h\n"
+ "fmla z30.h, p3/M, z3.h, z8.h\n"
+ "fmla z29.h, p3/M, z3.h, z11.h\n"
+ "fmla z28.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z8.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z9.h\n"
+ "fmax z31.h, p3/M, z31.h, z18.h\n"
+ "fmax z30.h, p3/M, z30.h, z18.h\n"
+ "fmax z29.h, p3/M, z29.h, z18.h\n"
+ "fmax z28.h, p3/M, z28.h, z18.h\n"
+ "fmin z31.h, p3/M, z31.h, z17.h\n"
+ "st1h { z31.h }, p0, [x17, x15, LSL #1]\n"
+ "fmin z30.h, p3/M, z30.h, z17.h\n"
+ "fmin z29.h, p3/M, z29.h, z17.h\n"
+ "st1h { z30.h }, p0, [x16, x15, LSL #1]\n"
+ "fmin z28.h, p3/M, z28.h, z17.h\n"
+ "st1h { z29.h }, p0, [x14, x15, LSL #1]\n"
+ "st1h { z28.h }, p0, [x13, x15, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
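The indirect variant above takes one pointer per element of the 6x6 input patch (36 in all); its Args constructor then permutes the leading entries so the pointers the accumulation consumes first sit at the front of the table. Assuming the caller supplies the patch row-major (an assumption here, suggested by the "input point (row, col)" comments in the direct variant), a hypothetical caller-side helper might look like:

#include <cstdint>

// Hypothetical: build the 36-entry pointer table for one 6x6 input patch,
// assuming row-major ordering of input_ptrs (an assumption, see above).
static void fill_patch_pointers(const __fp16 *in, int64_t ld_row, int64_t ld_col,
                                const __fp16 *input_ptrs[36])
{
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            input_ptrs[i * 6 + j] = in + i * ld_row + j * ld_col;
}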
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..74716ddf1f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+  sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d443855758
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  const float *const inptrs[16] = {
+    input_ptrs[0], input_ptrs[1], input_ptrs[4], input_ptrs[5], input_ptrs[2], input_ptrs[6], input_ptrs[3], input_ptrs[7], input_ptrs[8], input_ptrs[9], input_ptrs[10], input_ptrs[11], input_ptrs[12], input_ptrs[13], input_ptrs[14], input_ptrs[15],
+  };
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ldp x26, x23, [%x[inptrs], #0x0]\n"
+    "ptrue p2.b\n"
+    "ldp x25, x16, [%x[inptrs], #0x10]\n"
+    "mov x15, #0x0\n"
+    "ld1w { z15.s }, p2/Z, [%x[params]]\n"
+    "mov z14.d, z15.d\n"
+    "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "cntw x14\n"
+    "mov z12.d, z15.d\n"
+    "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "sub x13, XZR, x14\n"
+    "mov z10.d, z15.d\n"
+    "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "whilelt p1.s, XZR, %x[n_channels]\n"
+    "mov z8.d, z15.d\n"
+    "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "addvl %x[params], %x[params], #16\n"
+    "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n"
+    "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "addvl %x[params], %x[params], #-6\n"
+    "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n"
+    "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n"
+    "ldp x24, x12, [%x[inptrs], #0x20]\n"
+    "ldp x23, x11, [%x[inptrs], #0x30]\n"
+    "ldp x10, x9, [%x[inptrs], #0x40]\n"
+    "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n"
+    "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n"
+    "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n"
+    "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n"
+    "ldp x28, x27, 
[%x[inptrs], #0x50]\n" + "ldp x26, x25, [%x[inptrs], #0x60]\n" + "ldp x24, x23, [%x[inptrs], #0x70]\n" + "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n" + "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n" + "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n" + "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n" + "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n" + "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n" + "ldp x22, x21, [%x[outptrs], #0x0]\n" + "ldp x20, x19, [%x[outptrs], #0x10]\n" + "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n" + "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n" + "bge 2f\n" + "1:" // Loop + "fmla z14.s, p2/M, z13.s, z3.s\n" + "ld1w { z15.s }, p2/Z, [%x[params]]\n" + "incw x13\n" + "fmla z12.s, p2/M, z13.s, z0.s\n" + "ldp x26, x23, [%x[inptrs], #0x0]\n" + "mov p0.b, p1.b\n" + "fmla z10.s, p2/M, z13.s, z31.s\n" + "ldp x25, x16, [%x[inptrs], #0x10]\n" + "mov x15, x14\n" + "fmla z8.s, p2/M, z13.s, z30.s\n" + "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "incw x14\n" + "fmla z14.s, p2/M, z11.s, z0.s\n" + "ldp x24, x12, [%x[inptrs], #0x20]\n" + "whilelt p1.s, x15, %x[n_channels]\n" + "fmla z12.s, p2/M, z11.s, z29.s\n" + "ld1w { z3.s }, p1/Z, [x26, x15, LSL #2]\n" + "cmp x14, %x[n_channels]\n" + "fmla z10.s, p2/M, z11.s, z30.s\n" + "ld1w { z0.s }, p1/Z, [x23, x15, LSL #2]\n" + "ldp x23, x11, [%x[inptrs], #0x30]\n" + "fmla z8.s, p2/M, z11.s, z28.s\n" + "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "fmla z14.s, p2/M, z9.s, z29.s\n" + "ld1w { z29.s }, p1/Z, [x24, x15, LSL #2]\n" + "fmla z12.s, p2/M, z9.s, z27.s\n" + "ld1w { z27.s }, p1/Z, [x23, x15, LSL #2]\n" + "fmla z10.s, p2/M, z9.s, z28.s\n" + "ldp x10, x9, [%x[inptrs], #0x40]\n" + "fmla z8.s, p2/M, z9.s, z26.s\n" + "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n" + "fmla z14.s, p2/M, z7.s, z31.s\n" + "ld1w { z31.s }, p1/Z, [x25, x15, LSL #2]\n" + "fmla z12.s, p2/M, z7.s, z30.s\n" + "ldp x28, x27, [%x[inptrs], #0x50]\n" + "fmla z10.s, p2/M, z7.s, z25.s\n" + "ldp x26, x25, [%x[inptrs], #0x60]\n" + "fmla z8.s, p2/M, z7.s, z24.s\n" + "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "fmla z14.s, p2/M, z6.s, z30.s\n" + "ld1w { z30.s }, p1/Z, [x16, x15, LSL #2]\n" + "fmla z12.s, p2/M, z6.s, z28.s\n" + "ldp x24, x23, [%x[inptrs], #0x70]\n" + "fmla z10.s, p2/M, z6.s, z24.s\n" + "fmla z8.s, p2/M, z6.s, z23.s\n" + "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "fmla z14.s, p2/M, z5.s, z28.s\n" + "ld1w { z28.s }, p1/Z, [x12, x15, LSL #2]\n" + "fmla z12.s, p2/M, z5.s, z26.s\n" + "ld1w { z26.s }, p1/Z, [x11, x15, LSL #2]\n" + "fmla z10.s, p2/M, z5.s, z23.s\n" + "fmla z8.s, p2/M, z5.s, z22.s\n" + "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "fmla z14.s, p2/M, z4.s, z25.s\n" + "ld1w { z25.s }, p1/Z, [x10, x15, LSL #2]\n" + "fmla z12.s, p2/M, z4.s, z24.s\n" + "fmla z10.s, p2/M, z4.s, z21.s\n" + "ld1w { z21.s }, p1/Z, [x26, x15, LSL #2]\n" + "fmla z8.s, p2/M, z4.s, z20.s\n" + "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "fmla z14.s, p2/M, z2.s, z24.s\n" + "ld1w { z24.s }, p1/Z, [x9, x15, LSL #2]\n" + "fmla z12.s, p2/M, z2.s, z23.s\n" + "fmla z10.s, p2/M, z2.s, z20.s\n" + "ld1w { z20.s }, p1/Z, [x25, x15, LSL #2]\n" + "fmla z8.s, p2/M, z2.s, z19.s\n" + "ld1w { z2.s }, p2/Z, [%x[params], #-8, MUL VL]\n" + "fmla z14.s, p2/M, z1.s, z23.s\n" + "ld1w { z23.s }, p1/Z, [x28, x15, LSL #2]\n" + "fmla z12.s, p2/M, z1.s, z22.s\n" + "ld1w { z22.s }, p1/Z, [x27, x15, LSL #2]\n" + "fmla z10.s, p2/M, z1.s, z19.s\n" + "ld1w { z19.s }, p1/Z, [x24, x15, LSL #2]\n" + "fmla z8.s, p2/M, z1.s, z18.s\n" + "ld1w { 
z1.s }, p2/Z, [%x[params], #-7, MUL VL]\n" + "addvl %x[params], %x[params], #-6\n" + "fmax z14.s, p2/M, z14.s, z17.s\n" + "ld1w { z18.s }, p1/Z, [x23, x15, LSL #2]\n" + "fmax z12.s, p2/M, z12.s, z17.s\n" + "fmax z10.s, p2/M, z10.s, z17.s\n" + "fmax z8.s, p2/M, z8.s, z17.s\n" + "fmin z14.s, p2/M, z14.s, z16.s\n" + "st1w { z14.s }, p0, [x22, x13, LSL #2]\n" + "mov z14.d, z15.d\n" + "fmin z12.s, p2/M, z12.s, z16.s\n" + "st1w { z12.s }, p0, [x21, x13, LSL #2]\n" + "mov z12.d, z15.d\n" + "fmin z10.s, p2/M, z10.s, z16.s\n" + "st1w { z10.s }, p0, [x20, x13, LSL #2]\n" + "mov z10.d, z15.d\n" + "fmin z8.s, p2/M, z8.s, z16.s\n" + "st1w { z8.s }, p0, [x19, x13, LSL #2]\n" + "mov z8.d, z15.d\n" + "blt 1b\n" + "2:" // Tail + "fmla z14.s, p2/M, z13.s, z3.s\n" + "incw x13\n" + "fmla z12.s, p2/M, z13.s, z0.s\n" + "mov p0.b, p1.b\n" + "fmla z10.s, p2/M, z13.s, z31.s\n" + "fmla z8.s, p2/M, z13.s, z30.s\n" + "fmla z14.s, p2/M, z11.s, z0.s\n" + "fmla z12.s, p2/M, z11.s, z29.s\n" + "fmla z10.s, p2/M, z11.s, z30.s\n" + "fmla z8.s, p2/M, z11.s, z28.s\n" + "fmla z14.s, p2/M, z9.s, z29.s\n" + "fmla z12.s, p2/M, z9.s, z27.s\n" + "fmla z10.s, p2/M, z9.s, z28.s\n" + "fmla z8.s, p2/M, z9.s, z26.s\n" + "fmla z14.s, p2/M, z7.s, z31.s\n" + "fmla z12.s, p2/M, z7.s, z30.s\n" + "fmla z10.s, p2/M, z7.s, z25.s\n" + "fmla z8.s, p2/M, z7.s, z24.s\n" + "fmla z14.s, p2/M, z6.s, z30.s\n" + "fmla z12.s, p2/M, z6.s, z28.s\n" + "fmla z10.s, p2/M, z6.s, z24.s\n" + "fmla z8.s, p2/M, z6.s, z23.s\n" + "fmla z14.s, p2/M, z5.s, z28.s\n" + "fmla z12.s, p2/M, z5.s, z26.s\n" + "fmla z10.s, p2/M, z5.s, z23.s\n" + "fmla z8.s, p2/M, z5.s, z22.s\n" + "fmla z14.s, p2/M, z4.s, z25.s\n" + "fmla z12.s, p2/M, z4.s, z24.s\n" + "fmla z10.s, p2/M, z4.s, z21.s\n" + "fmla z8.s, p2/M, z4.s, z20.s\n" + "fmla z14.s, p2/M, z2.s, z24.s\n" + "fmla z12.s, p2/M, z2.s, z23.s\n" + "fmla z10.s, p2/M, z2.s, z20.s\n" + "fmla z8.s, p2/M, z2.s, z19.s\n" + "fmla z14.s, p2/M, z1.s, z23.s\n" + "fmla z12.s, p2/M, z1.s, z22.s\n" + "fmla z10.s, p2/M, z1.s, z19.s\n" + "fmla z8.s, p2/M, z1.s, z18.s\n" + "fmax z14.s, p2/M, z14.s, z17.s\n" + "fmax z12.s, p2/M, z12.s, z17.s\n" + "fmax z10.s, p2/M, z10.s, z17.s\n" + "fmax z8.s, p2/M, z8.s, z17.s\n" + "fmin z14.s, p2/M, z14.s, z16.s\n" + "st1w { z14.s }, p0, [x22, x13, LSL #2]\n" + "fmin z12.s, p2/M, z12.s, z16.s\n" + "fmin z10.s, p2/M, z10.s, z16.s\n" + "st1w { z12.s }, p0, [x21, x13, LSL #2]\n" + "fmin z8.s, p2/M, z8.s, z16.s\n" + "st1w { z10.s }, p0, [x20, x13, LSL #2]\n" + "st1w { z8.s }, p0, [x19, x13, LSL #2]\n" + : [params] "+r" (params) + : [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((unsigned long) n_channels), [outptrs] "r" (outptrs) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..d899255e84 --- /dev/null +++ 
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x8, #0x0\n" + "mov x17, #0x0\n" + "1:" // Tile loop + "str x8, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x21, #0x2\n" + "str x17, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "cntb x16\n" + "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n" + "add x16, x16, XZR, LSL #4\n" + "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cntb x14\n" + "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "cntb x12\n" + "ldr x11, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x8, x20\n" // offset = tile_i * ld_input_row + "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + 
"madd x19, x17, x13, x19\n" // offset += tile_j * ld_input_col + "ldr x10, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x21\n" // offset *= kernel_stride * output_size + "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x11, x11, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x28, x11, x20, LSL #2\n" + "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x27, x28, x20, LSL #2\n" + "ld1w { z16.s }, p3/Z, [x15]\n" + "mov z31.d, z16.d\n" + "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n" + "add x26, x27, x20, LSL #2\n" + "mov z30.d, z16.d\n" + "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n" + "add x25, x13, x13\n" + "mov z29.d, z16.d\n" + "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n" + "add x24, x25, x13\n" + "mov z28.d, z16.d\n" + "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n" + "add x14, x14, x13, LSL #4\n" + "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n" + "add x12, x12, x25, LSL #4\n" + "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n" + "cntb x23\n" + "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n" + "add x23, x23, x24, LSL #4\n" + "prfm pldl1keep, [x28, x14]\n" + "mov x20, #0x2\n" + "prfm pldl1keep, [x11, x16]\n" + "mul x19, x8, x22\n" // offset = tile_i * ld_output_row + "prfm pldl1keep, [x11, x23]\n" + "madd x19, x17, x10, x19\n" // offset += tile_j * ld_output_col + "prfm pldl1keep, [x28, x12]\n" + "mul x19, x19, x20\n" // offset *= output_tile_size + "prfm pldl1keep, [x27, x14]\n" + "add x9, x9, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "mov x21, #0x0\n" + "add x22, x9, x22, LSL #2\n" + "cntw x20\n" + "sub x19, XZR, x20\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "ld1w { z9.s }, p2/Z, [x28, x13, LSL #2]\n" + "ld1w { z10.s }, p2/Z, [x11]\n" + "addvl x15, x15, #16\n" + "ld1w { z11.s }, p2/Z, [x11, x24, LSL #2]\n" + "cmp x20, %x[n_channels]\n" + "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n" + "addvl x15, x15, #-6\n" + "ld1w { z12.s }, p2/Z, [x28, x25, LSL #2]\n" + "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x26, x16]\n" + "whilelt p1.s, x20, %x[n_channels]\n" + "fmla z30.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x26, x23]\n" + "incw x19\n" + "fmla z29.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x11, x14]\n" + "mov p0.b, p2.b\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x26]\n" + "incw x21\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "prfm pldl1keep, [x11, x12]\n" + "incw x20\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x26, x24, LSL #2]\n" + "ld1w { z10.s }, p2/Z, [x27, x25, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "prfm pldl1keep, [x27, x12]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x28, x16]\n" + "fmla z31.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x28, x23]\n" + "fmla z30.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x13, LSL #2]\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x11, x25, LSL #2]\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "addvl x11, x11, #1\n" + "fmla z31.s, p3/M, z7.s, z13.s\n" + "prfm pldl1keep, [x27, x16]\n" + "prfm pldl1keep, [x27, x23]\n" + "fmla z30.s, p3/M, z6.s, z13.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x26, x12]\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x28]\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "ld1w { z16.s }, p3/Z, 
[x15]\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z10.s\n" + "addvl x28, x28, #1\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x28, x14]\n" + "prfm pldl1keep, [x11, x16]\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "prfm pldl1keep, [x11, x23]\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x27]\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "prfm pldl1keep, [x28, x12]\n" + "fmla z28.s, p3/M, z2.s, z12.s\n" + "ld1w { z0.s }, p3/Z, [x15, #1, MUL VL]\n" + "fmla z31.s, p3/M, z8.s, z10.s\n" + "ld1w { z1.s }, p3/Z, [x15, #2, MUL VL]\n" + "fmla z30.s, p3/M, z7.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x27, x24, LSL #2]\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "addvl x27, x27, #1\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z28.s, p3/M, z5.s, z10.s\n" + "ld1w { z13.s }, p1/Z, [x27, x13, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n" + "whilelt p2.s, x21, %x[n_channels]\n" + "fmla z29.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x27, x14]\n" + "addvl x26, x26, #1\n" + "fmla z31.s, p3/M, z6.s, z9.s\n" + "ld1w { z9.s }, p1/Z, [x28, x13, LSL #2]\n" + "cmp x20, %x[n_channels]\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "ld1w { z10.s }, p1/Z, [x11]\n" + "fmla z28.s, p3/M, z6.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x11, x24, LSL #2]\n" + "ld1w { z2.s }, p3/Z, [x15, #3, MUL VL]\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "ld1w { z3.s }, p3/Z, [x15, #4, MUL VL]\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "ld1w { z4.s }, p3/Z, [x15, #5, MUL VL]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "ld1w { z12.s }, p1/Z, [x28, x25, LSL #2]\n" + "ld1w { z5.s }, p3/Z, [x15, #6, MUL VL]\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "ld1w { z6.s }, p3/Z, [x15, #7, MUL VL]\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "addvl x15, x15, #16\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "ld1w { z7.s }, p3/Z, [x15, #-8, MUL VL]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "ld1w { z8.s }, p3/Z, [x15, #-7, MUL VL]\n" + "addvl x15, x15, #-6\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z31.s }, p0, [x9]\n" + "mov z31.d, z16.d\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z30.s }, p0, [x9, x10, LSL #2]\n" + "addvl x9, x9, #1\n" + "mov z30.d, z16.d\n" + "st1w { z29.s }, p0, [x22]\n" + "mov z29.d, z16.d\n" + "st1w { z28.s }, p0, [x22, x10, LSL #2]\n" + "mov z28.d, z16.d\n" + "addvl x22, x22, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x26, x16]\n" + "mov p0.b, p2.b\n" + "fmla z30.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x26, x23]\n" + "fmla z29.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x11, x14]\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x26]\n" + "prfm pldl1keep, [x11, x12]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x27, x25, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "ld1w { z11.s }, p2/Z, [x26, x24, LSL #2]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x27, x12]\n" + "prfm pldl1keep, [x28, x16]\n" + "fmla z31.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x28, x23]\n" + "fmla z30.s, p3/M, z4.s, z12.s\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "ld1w { z12.s }, p2/Z, [x11, x13, LSL #2]\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "ld1w { z9.s }, p2/Z, [x11, x25, LSL #2]\n" + "prfm pldl1keep, [x27, x16]\n" + "fmla z31.s, p3/M, z7.s, z13.s\n" + "prfm pldl1keep, [x27, x23]\n" + "fmla z30.s, 
p3/M, z6.s, z13.s\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x28]\n" + "prfm pldl1keep, [x26, x12]\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "ldr x8, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "add x21, x8, #0x1\n" + "fmla z29.s, p3/M, z5.s, z10.s\n" + "ld1w { z12.s }, p2/Z, [x28, x24, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "ldr x17, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x17, x17, #0x1\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x27]\n" + "fmla z31.s, p3/M, z8.s, z10.s\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x17, x19\n" + "fmla z30.s, p3/M, z7.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x27, x24, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "csel x17, x17, XZR, LT\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n" + "csel x8, x8, x21, LT\n" + "fmla z28.s, p3/M, z2.s, z12.s\n" + "cmp x8, x20\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x26, x25, LSL #2]\n" + "fmla z31.s, p3/M, z6.s, z9.s\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "fmla z28.s, p3/M, z5.s, z10.s\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "fmla z29.s, p3/M, z7.s, z11.s\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "fmla z28.s, p3/M, z6.s, z11.s\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "st1w { z31.s }, p0, [x9]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "st1w { z30.s }, p0, [x9, x10, LSL #2]\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z29.s }, p0, [x22]\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z28.s }, p0, [x22, x10, LSL #2]\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..e8a1539437 --- /dev/null +++ 
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[16]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[0]; + inptrs[1] = input_ptrs[1]; + inptrs[2] = input_ptrs[2]; + inptrs[3] = input_ptrs[3]; + inptrs[4] = input_ptrs[4]; + inptrs[5] = input_ptrs[5]; + inptrs[6] = input_ptrs[6]; + inptrs[7] = input_ptrs[7]; + inptrs[8] = input_ptrs[8]; + inptrs[9] = input_ptrs[9]; + inptrs[10] = input_ptrs[10]; + inptrs[11] = input_ptrs[11]; + inptrs[12] = input_ptrs[12]; + inptrs[13] = input_ptrs[13]; + inptrs[14] = input_ptrs[14]; + inptrs[15] = input_ptrs[15]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x2, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n" + "add x19, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "cntb x4, ALL, MUL #2\n" + "ldp x5, x6, [x19, #0x0]\n" + "mov x7, #0x0\n" + "ldp x8, x17, [x19, #0x10]\n" + "cntw x16\n" + "ldp x15, x14, [x19, #0x20]\n" + "sub x13, XZR, x16\n" + "ldp x12, x11, [x19, #0x30]\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "ldp x10, x9, [x19, #0x40]\n" + "cmp x16, %x[n_channels]\n" + "ldp x28, x27, [x19, #0x50]\n" + "ldp x26, x25, [x19, #0x60]\n" + "ldp x24, x23, [x19, #0x70]\n" + "ldp x22, x21, [x2, #0x0]\n" + "ldp x20, x19, [x2, #0x10]\n" + "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "ld1w { z16.s }, p3/Z, [x3]\n" + "mov z31.d, z16.d\n" + "ld1w { z0.s }, p3/Z, [x3, #1, MUL VL]\n" + "mov z30.d, z16.d\n" + "ld1w { z1.s }, p3/Z, 
[x3, #2, MUL VL]\n" + "mov z29.d, z16.d\n" + "ld1w { z2.s }, p3/Z, [x3, #3, MUL VL]\n" + "mov z28.d, z16.d\n" + "ld1w { z3.s }, p3/Z, [x3, #4, MUL VL]\n" + "ld1w { z4.s }, p3/Z, [x3, #5, MUL VL]\n" + "ld1w { z5.s }, p3/Z, [x3, #6, MUL VL]\n" + "ld1w { z6.s }, p3/Z, [x3, #7, MUL VL]\n" + "addvl x3, x3, #16\n" + "ld1w { z9.s }, p2/Z, [x14, x7, LSL #2]\n" + "ld1w { z7.s }, p3/Z, [x3, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x3, #-7, MUL VL]\n" + "addvl x3, x3, #-6\n" + "prfm pldl1keep, [x14, x4]\n" + "ld1w { z10.s }, p2/Z, [x5, x7, LSL #2]\n" + "prfm pldl1keep, [x5, x4]\n" + "ld1w { z11.s }, p2/Z, [x17, x7, LSL #2]\n" + "prfm pldl1keep, [x17, x4]\n" + "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n" + "prfm pldl1keep, [x12, x4]\n" + "ld1w { z13.s }, p2/Z, [x9, x7, LSL #2]\n" + "prfm pldl1keep, [x9, x4]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x26, x4]\n" + "whilelt p1.s, x16, %x[n_channels]\n" + "fmla z30.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x23, x4]\n" + "incw x13\n" + "fmla z29.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x6, x4]\n" + "mov p0.b, p2.b\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x26, x7, LSL #2]\n" + "prfm pldl1keep, [x8, x4]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x23, x7, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "prfm pldl1keep, [x28, x4]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x15, x4]\n" + "fmla z31.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x11, x4]\n" + "fmla z30.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x6, x7, LSL #2]\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "prfm pldl1keep, [x10, x4]\n" + "fmla z31.s, p3/M, z7.s, z13.s\n" + "prfm pldl1keep, [x27, x4]\n" + "fmla z30.s, p3/M, z6.s, z13.s\n" + "prfm pldl1keep, [x25, x4]\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x24, x4]\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "addvl x4, x4, #1\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "ld1w { z11.s }, p2/Z, [x15, x7, LSL #2]\n" + "prfm pldl1keep, [x14, x4]\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x7, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z10.s\n" + "prfm pldl1keep, [x5, x4]\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x17, x4]\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "prfm pldl1keep, [x12, x4]\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x10, x7, LSL #2]\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z13.s }, p1/Z, [x9, x16, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z12.s\n" + "prfm pldl1keep, [x9, x4]\n" + "fmla z31.s, p3/M, z8.s, z10.s\n" + "ld1w { z16.s }, p3/Z, [x3]\n" + "fmla z30.s, p3/M, z7.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x27, x7, LSL #2]\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "ld1w { z0.s }, p3/Z, [x3, #1, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x25, x7, LSL #2]\n" + "fmla z28.s, p3/M, z5.s, z10.s\n" + "ld1w { z1.s }, p3/Z, [x3, #2, MUL VL]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n" + "incw x7\n" + "fmla z29.s, p3/M, z7.s, z11.s\n" + "ld1w { z2.s }, p3/Z, [x3, #3, MUL VL]\n" + "whilelt p2.s, x7, %x[n_channels]\n" + "fmla z31.s, p3/M, z6.s, z9.s\n" + "ld1w { z9.s }, p1/Z, [x14, x16, LSL #2]\n" + "fmla z28.s, p3/M, z6.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x17, x16, LSL #2]\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "ld1w { z10.s }, p1/Z, [x5, 
x16, LSL #2]\n" + "ld1w { z3.s }, p3/Z, [x3, #4, MUL VL]\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "ld1w { z4.s }, p3/Z, [x3, #5, MUL VL]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "ld1w { z12.s }, p1/Z, [x12, x16, LSL #2]\n" + "incw x16\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "ld1w { z5.s }, p3/Z, [x3, #6, MUL VL]\n" + "cmp x16, %x[n_channels]\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "ld1w { z6.s }, p3/Z, [x3, #7, MUL VL]\n" + "addvl x3, x3, #16\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "ld1w { z7.s }, p3/Z, [x3, #-8, MUL VL]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "ld1w { z8.s }, p3/Z, [x3, #-7, MUL VL]\n" + "addvl x3, x3, #-6\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "st1w { z31.s }, p0, [x22, x13, LSL #2]\n" + "mov z31.d, z16.d\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z30.s }, p0, [x21, x13, LSL #2]\n" + "mov z30.d, z16.d\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z29.s }, p0, [x20, x13, LSL #2]\n" + "mov z29.d, z16.d\n" + "st1w { z28.s }, p0, [x19, x13, LSL #2]\n" + "mov z28.d, z16.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x26, x4]\n" + "incw x13\n" + "fmla z30.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x23, x4]\n" + "mov p0.b, p2.b\n" + "fmla z29.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x6, x4]\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x26, x7, LSL #2]\n" + "prfm pldl1keep, [x8, x4]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x23, x7, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x28, x4]\n" + "prfm pldl1keep, [x15, x4]\n" + "fmla z31.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x11, x4]\n" + "fmla z30.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x6, x7, LSL #2]\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n" + "prfm pldl1keep, [x10, x4]\n" + "fmla z31.s, p3/M, z7.s, z13.s\n" + "prfm pldl1keep, [x27, x4]\n" + "fmla z30.s, p3/M, z6.s, z13.s\n" + "prfm pldl1keep, [x25, x4]\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x15, x7, LSL #2]\n" + "prfm pldl1keep, [x24, x4]\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x7, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z10.s\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x10, x7, LSL #2]\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "fmla z28.s, p3/M, z2.s, z12.s\n" + "fmla z31.s, p3/M, z8.s, z10.s\n" + "fmla z30.s, p3/M, z7.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x27, x7, LSL #2]\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x25, x7, LSL #2]\n" + "fmla z28.s, p3/M, z5.s, z10.s\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n" + "fmla z29.s, p3/M, z7.s, z11.s\n" + "fmla z31.s, p3/M, z6.s, z9.s\n" + "fmla z28.s, p3/M, z6.s, z11.s\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "st1w { z31.s }, p0, [x22, x13, LSL #2]\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z30.s 
}, p0, [x21, x13, LSL #2]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "st1w { z29.s }, p0, [x20, x13, LSL #2]\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z28.s }, p0, [x19, x13, LSL #2]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp new file mode 100644 index 0000000000..173fc631d8 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float); + +struct sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided +{ + typedef float bias_type; + typedef float operand_type; + typedef float return_type; + + typedef void (*kern_type)(const float *const, const size_t, const size_t, float *const, const size_t, const size_t, const void *, unsigned long, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + kern_type kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl; + + sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp new file mode 100644 index 0000000000..cecc192c49 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided_impl( + const float *const inptr, + const size_t in_row_stride, + const size_t in_col_stride, + float *const outptr, + const size_t out_row_stride, + const size_t out_col_stride, + const void *params, + unsigned long n_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ptrue p2.b\n" + "ld1w { z15.s }, p2/Z, [%x[params]]\n" + "mov z14.d, z15.d\n" + "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "whilelt p1.s, XZR, %x[n_channels]\n" + "mov z12.d, z15.d\n" + "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "mov x26, %x[inptr]\n" + "mov z10.d, z15.d\n" + "ld1w { z9.s }, p2/Z, [%x[params], #3, MUL VL]\n" + "add x25, x26, %x[in_row_stride], LSL #2\n" + "mov z8.d, z15.d\n" + "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "add x24, x25, %x[in_row_stride], LSL #2\n" + "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "add x23, x24, %x[in_row_stride], LSL #2\n" + "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "mov x22, %x[outptr]\n" + "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "add x21, x22, %x[out_row_stride], LSL #2\n" + "ld1w { z3.s }, p1/Z, [x26]\n" + "add x20, %x[in_col_stride], %x[in_col_stride]\n" + "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n" + "add x19, x20, %x[in_col_stride]\n" + "ld1w { z1.s }, p1/Z, [x25]\n" + "addvl %x[params], %x[params], #16\n" + "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n" + "decw %x[n_channels]\n" + "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n" + "cmp %x[n_channels], XZR\n" + "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n" + "addvl %x[params], %x[params], #-6\n" + "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n" + "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n" + "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n" + "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n" + "ld1w { z25.s }, p1/Z, [x24]\n" + "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n" + "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n" + "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n" + "ld1w { z21.s }, p1/Z, [x23]\n" + "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n" + "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n" + "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n" + "ld1rw { z17.s }, p2/Z, [%x[minmax_vals]]\n" + "ld1rw { z16.s }, p2/Z, [%x[minmax_vals], #4]\n" + "ble 2f\n" + "1:" // Loop + "fmla z14.s, p2/M, z13.s, z3.s\n" + "ld1w { z15.s }, p2/Z, [%x[params]]\n" + "addvl x26, x26, #1\n" + "fmla z12.s, p2/M, z13.s, z2.s\n" + "addvl x25, x25, #1\n" + "fmla z10.s, p2/M, z13.s, z1.s\n" + "addvl x24, x24, #1\n" + "fmla z8.s, p2/M, z13.s, z0.s\n" + "ld1w { z13.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "addvl x23, x23, #1\n" + "fmla z14.s, p2/M, z11.s, z2.s\n" + "decw %x[n_channels]\n" + "mov p0.b, p1.b\n" + "fmla z12.s, p2/M, z11.s, z29.s\n" + "fmla z10.s, p2/M, z11.s, z0.s\n" + "whilelt p1.s, XZR, %x[n_channels]\n" + "ld1w { z3.s }, p1/Z, [x26]\n" + "fmla z8.s, p2/M, z11.s, z28.s\n" + "cmp %x[n_channels], XZR\n" + "fmla z14.s, p2/M, z9.s, z29.s\n" + "ld1w { z11.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "ld1w { z2.s }, p1/Z, [x26, %x[in_col_stride], LSL #2]\n" + "fmla z12.s, p2/M, z9.s, z27.s\n" + "fmla z10.s, p2/M, z9.s, z28.s\n" + "ld1w { z29.s }, p1/Z, [x26, x20, LSL #2]\n" + "ld1w { z27.s }, p1/Z, [x26, x19, LSL #2]\n" + "fmla z8.s, p2/M, z9.s, z26.s\n" + "ld1w { 
z9.s }, p2/Z, [%x[params], #3, MUL VL]\n" + "fmla z14.s, p2/M, z7.s, z1.s\n" + "ld1w { z1.s }, p1/Z, [x25]\n" + "fmla z12.s, p2/M, z7.s, z0.s\n" + "fmla z10.s, p2/M, z7.s, z25.s\n" + "fmla z8.s, p2/M, z7.s, z24.s\n" + "ld1w { z7.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "fmla z14.s, p2/M, z6.s, z0.s\n" + "ld1w { z0.s }, p1/Z, [x25, %x[in_col_stride], LSL #2]\n" + "fmla z12.s, p2/M, z6.s, z28.s\n" + "fmla z10.s, p2/M, z6.s, z24.s\n" + "fmla z8.s, p2/M, z6.s, z23.s\n" + "ld1w { z6.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "fmla z14.s, p2/M, z5.s, z28.s\n" + "ld1w { z28.s }, p1/Z, [x25, x20, LSL #2]\n" + "fmla z12.s, p2/M, z5.s, z26.s\n" + "ld1w { z26.s }, p1/Z, [x25, x19, LSL #2]\n" + "fmla z10.s, p2/M, z5.s, z23.s\n" + "fmla z8.s, p2/M, z5.s, z22.s\n" + "ld1w { z5.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "fmla z14.s, p2/M, z4.s, z25.s\n" + "ld1w { z25.s }, p1/Z, [x24]\n" + "fmla z12.s, p2/M, z4.s, z24.s\n" + "fmla z10.s, p2/M, z4.s, z21.s\n" + "ld1w { z21.s }, p1/Z, [x23]\n" + "fmla z8.s, p2/M, z4.s, z20.s\n" + "ld1w { z4.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "fmla z14.s, p2/M, z31.s, z24.s\n" + "ld1w { z24.s }, p1/Z, [x24, %x[in_col_stride], LSL #2]\n" + "fmla z12.s, p2/M, z31.s, z23.s\n" + "fmla z10.s, p2/M, z31.s, z20.s\n" + "ld1w { z20.s }, p1/Z, [x23, %x[in_col_stride], LSL #2]\n" + "fmla z8.s, p2/M, z31.s, z19.s\n" + "ld1w { z31.s }, p2/Z, [%x[params], #-8, MUL VL]\n" + "fmla z14.s, p2/M, z30.s, z23.s\n" + "ld1w { z23.s }, p1/Z, [x24, x20, LSL #2]\n" + "fmla z12.s, p2/M, z30.s, z22.s\n" + "ld1w { z22.s }, p1/Z, [x24, x19, LSL #2]\n" + "fmla z10.s, p2/M, z30.s, z19.s\n" + "ld1w { z19.s }, p1/Z, [x23, x20, LSL #2]\n" + "fmla z8.s, p2/M, z30.s, z18.s\n" + "ld1w { z30.s }, p2/Z, [%x[params], #-7, MUL VL]\n" + "addvl %x[params], %x[params], #-6\n" + "fmax z14.s, p2/M, z14.s, z17.s\n" + "ld1w { z18.s }, p1/Z, [x23, x19, LSL #2]\n" + "fmax z12.s, p2/M, z12.s, z17.s\n" + "fmax z10.s, p2/M, z10.s, z17.s\n" + "fmax z8.s, p2/M, z8.s, z17.s\n" + "fmin z14.s, p2/M, z14.s, z16.s\n" + "st1w { z14.s }, p0, [x22]\n" + "mov z14.d, z15.d\n" + "fmin z12.s, p2/M, z12.s, z16.s\n" + "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n" + "mov z12.d, z15.d\n" + "addvl x22, x22, #1\n" + "fmin z10.s, p2/M, z10.s, z16.s\n" + "st1w { z10.s }, p0, [x21]\n" + "mov z10.d, z15.d\n" + "fmin z8.s, p2/M, z8.s, z16.s\n" + "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n" + "mov z8.d, z15.d\n" + "addvl x21, x21, #1\n" + "bgt 1b\n" + "2:" // Tail + "fmla z14.s, p2/M, z13.s, z3.s\n" + "mov p0.b, p1.b\n" + "fmla z12.s, p2/M, z13.s, z2.s\n" + "fmla z10.s, p2/M, z13.s, z1.s\n" + "fmla z8.s, p2/M, z13.s, z0.s\n" + "fmla z14.s, p2/M, z11.s, z2.s\n" + "fmla z12.s, p2/M, z11.s, z29.s\n" + "fmla z10.s, p2/M, z11.s, z0.s\n" + "fmla z8.s, p2/M, z11.s, z28.s\n" + "fmla z14.s, p2/M, z9.s, z29.s\n" + "fmla z12.s, p2/M, z9.s, z27.s\n" + "fmla z10.s, p2/M, z9.s, z28.s\n" + "fmla z8.s, p2/M, z9.s, z26.s\n" + "fmla z14.s, p2/M, z7.s, z1.s\n" + "fmla z12.s, p2/M, z7.s, z0.s\n" + "fmla z10.s, p2/M, z7.s, z25.s\n" + "fmla z8.s, p2/M, z7.s, z24.s\n" + "fmla z14.s, p2/M, z6.s, z0.s\n" + "fmla z12.s, p2/M, z6.s, z28.s\n" + "fmla z10.s, p2/M, z6.s, z24.s\n" + "fmla z8.s, p2/M, z6.s, z23.s\n" + "fmla z14.s, p2/M, z5.s, z28.s\n" + "fmla z12.s, p2/M, z5.s, z26.s\n" + "fmla z10.s, p2/M, z5.s, z23.s\n" + "fmla z8.s, p2/M, z5.s, z22.s\n" + "fmla z14.s, p2/M, z4.s, z25.s\n" + "fmla z12.s, p2/M, z4.s, z24.s\n" + "fmla z10.s, p2/M, z4.s, z21.s\n" + "fmla z8.s, p2/M, z4.s, z20.s\n" + "fmla z14.s, p2/M, 
z31.s, z24.s\n" + "fmla z12.s, p2/M, z31.s, z23.s\n" + "fmla z10.s, p2/M, z31.s, z20.s\n" + "fmla z8.s, p2/M, z31.s, z19.s\n" + "fmla z14.s, p2/M, z30.s, z23.s\n" + "fmla z12.s, p2/M, z30.s, z22.s\n" + "fmla z10.s, p2/M, z30.s, z19.s\n" + "fmla z8.s, p2/M, z30.s, z18.s\n" + "fmax z14.s, p2/M, z14.s, z17.s\n" + "fmax z12.s, p2/M, z12.s, z17.s\n" + "fmax z10.s, p2/M, z10.s, z17.s\n" + "fmax z8.s, p2/M, z8.s, z17.s\n" + "fmin z14.s, p2/M, z14.s, z16.s\n" + "st1w { z14.s }, p0, [x22]\n" + "fmin z12.s, p2/M, z12.s, z16.s\n" + "fmin z10.s, p2/M, z10.s, z16.s\n" + "st1w { z12.s }, p0, [x22, %x[out_col_stride], LSL #2]\n" + "fmin z8.s, p2/M, z8.s, z16.s\n" + "st1w { z10.s }, p0, [x21]\n" + "st1w { z8.s }, p0, [x21, %x[out_col_stride], LSL #2]\n" + : [n_channels] "+r" (n_channels), [params] "+r" (params) + : [in_col_stride] "r" (in_col_stride), [in_row_stride] "r" (in_row_stride), [inptr] "r" (inptr), [minmax_vals] "r" (minmax_vals), [out_col_stride] "r" (out_col_stride), [out_row_stride] "r" (out_row_stride), [outptr] "r" (outptr) + : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp new file mode 100644 index 0000000000..5ec78aa05f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); +void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + +struct sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 3; + constexpr static unsigned int output_cols = 3; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl; + + sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..4d0bd311cc --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x3, #0x0\n" + "mov x4, #0x0\n" + "1:" // Tile loop + "str x3, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x22, #0x3\n" + "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "cntb x5\n" + "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n" + "add x5, x5, XZR, LSL #4\n" + "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cntb x7\n" + "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "cntb x17\n" + "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x3, x20\n" // offset = tile_i * ld_input_row + "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x4, x8, x19\n" // offset += tile_j * ld_input_col + "ldr x15, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x22\n" // offset *= kernel_stride * output_size + "ldr x14, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x16, x16, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x13, x16, x20, LSL #2\n" + "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x12, x13, x20, LSL #2\n" + "ld1w { z16.s }, p3/Z, [x6]\n" + "mov z31.d, z16.d\n" + "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n" + "add x11, x12, x20, LSL #2\n" + "mov z30.d, z16.d\n" + "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n" + "add x10, x11, x20, LSL #2\n" + "mov z29.d, z16.d\n" + "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n" + "add x9, x8, x8\n" + "mov z28.d, z16.d\n" + "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n" + "add x28, x9, x8\n" + "mov z27.d, z16.d\n" + "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n" + "add x27, x28, x8\n" + "mov z26.d, z16.d\n" + "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n" + "add x7, x7, x8, LSL #4\n" + "mov z25.d, z16.d\n" + "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n" + "add x17, x17, x9, LSL #4\n" + "mov z24.d, z16.d\n" + "prfm pldl1keep, [x12, x17]\n" + "cntb x26\n" + "mov z23.d, 
z16.d\n" + "prfm pldl1keep, [x16, x5]\n" + "add x26, x26, x28, LSL #4\n" + "cntb x25\n" + "mov x20, #0x3\n" + "add x25, x25, x27, LSL #4\n" + "prfm pldl1keep, [x16, x25]\n" + "prfm pldl1keep, [x10, x5]\n" + "mul x19, x3, x21\n" // offset = tile_i * ld_output_row + "prfm pldl1keep, [x13, x17]\n" + "madd x19, x4, x15, x19\n" // offset += tile_j * ld_output_col + "add x24, x15, x15\n" + "mul x19, x19, x20\n" // offset *= output_tile_size + "add x14, x14, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "add x23, x14, x21, LSL #2\n" + "add x22, x23, x21, LSL #2\n" + "mov x21, #0x0\n" + "cntw x20\n" + "sub x19, XZR, x20\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "ld1w { z9.s }, p2/Z, [x12, x9, LSL #2]\n" + "ld1w { z10.s }, p2/Z, [x16]\n" + "addvl x6, x6, #16\n" + "ld1w { z11.s }, p2/Z, [x16, x27, LSL #2]\n" + "cmp x20, %x[n_channels]\n" + "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n" + "addvl x6, x6, #-6\n" + "ld1w { z12.s }, p2/Z, [x10]\n" + "ld1w { z13.s }, p2/Z, [x13, x9, LSL #2]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.s, p3/M, z8.s, z9.s\n" + "prfm pldl1keep, [x10, x25]\n" + "whilelt p1.s, x20, %x[n_channels]\n" + "fmla z30.s, p3/M, z7.s, z9.s\n" + "prfm pldl1keep, [x12, x7]\n" + "incw x19\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "prfm pldl1keep, [x16, x7]\n" + "mov p0.b, p2.b\n" + "fmla z28.s, p3/M, z5.s, z9.s\n" + "prfm pldl1keep, [x16, x26]\n" + "incw x21\n" + "fmla z27.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x12, x26]\n" + "incw x20\n" + "fmla z26.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x13, x5]\n" + "fmla z25.s, p3/M, z2.s, z9.s\n" + "prfm pldl1keep, [x13, x25]\n" + "fmla z24.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x11, x5]\n" + "fmla z23.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x11, x17]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x8, LSL #2]\n" + "fmla z25.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x27, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x11, x25]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x10, x7]\n" + "fmla z29.s, p3/M, z3.s, z13.s\n" + "prfm pldl1keep, [x13, x7]\n" + "fmla z28.s, p3/M, z2.s, z13.s\n" + "prfm pldl1keep, [x13, x26]\n" + "fmla z27.s, p3/M, z1.s, z13.s\n" + "prfm pldl1keep, [x10, x26]\n" + "fmla z26.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x16, x8, LSL #2]\n" + "fmla z23.s, p3/M, z8.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x16, x28, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x11, x7]\n" + "fmla z30.s, p3/M, z6.s, z11.s\n" + "prfm pldl1keep, [x16, x17]\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "prfm pldl1keep, [x11, x26]\n" + "fmla z27.s, p3/M, z3.s, z11.s\n" + "prfm pldl1keep, [x12, x5]\n" + "fmla z25.s, p3/M, z1.s, z11.s\n" + "prfm pldl1keep, [x12, x25]\n" + "fmla z24.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13]\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "prfm pldl1keep, [x10, x17]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x13, x27, LSL #2]\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "ld1w { z16.s }, p3/Z, [x6]\n" + "fmla z27.s, p3/M, z5.s, z10.s\n" + "fmla z26.s, p3/M, z4.s, z10.s\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11]\n" + "fmla z29.s, p3/M, z7.s, z10.s\n" + "fmla z24.s, p3/M, z2.s, z10.s\n" + "fmla z23.s, p3/M, z1.s, z10.s\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n" + "fmla z31.s, p3/M, 
z3.s, z11.s\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z13.s\n" + "fmla z26.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x8, LSL #2]\n" + "fmla z25.s, p3/M, z3.s, z12.s\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x13, x8, LSL #2]\n" + "fmla z27.s, p3/M, z7.s, z10.s\n" + "fmla z26.s, p3/M, z6.s, z10.s\n" + "fmla z25.s, p3/M, z5.s, z10.s\n" + "fmla z28.s, p3/M, z8.s, z10.s\n" + "fmla z24.s, p3/M, z4.s, z10.s\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "fmla z26.s, p3/M, z8.s, z11.s\n" + "fmla z25.s, p3/M, z7.s, z13.s\n" + "fmla z24.s, p3/M, z6.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x28, LSL #2]\n" + "fmla z23.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n" + "addvl x13, x13, #1\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "fmla z27.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x8, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "fmla z30.s, p3/M, z5.s, z11.s\n" + "fmla z26.s, p3/M, z1.s, z11.s\n" + "fmla z27.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x16, x9, LSL #2]\n" + "addvl x16, x16, #1\n" + "fmla z24.s, p3/M, z8.s, z13.s\n" + "ld1w { z10.s }, p1/Z, [x16]\n" + "fmla z23.s, p3/M, z7.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n" + "addvl x11, x11, #1\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmla z27.s, p3/M, z6.s, z12.s\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x12]\n" + "fmla z31.s, p3/M, z2.s, z11.s\n" + "fmla z30.s, p3/M, z1.s, z11.s\n" + "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n" + "addvl x12, x12, #1\n" + "fmla z27.s, p3/M, z8.s, z13.s\n" + "ld1w { z9.s }, p1/Z, [x12, x9, LSL #2]\n" + "fmla z26.s, p3/M, z7.s, z13.s\n" + "prfm pldl1keep, [x12, x17]\n" + "fmla z24.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x16, x5]\n" + "fmla z23.s, p3/M, z4.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n" + "whilelt p2.s, x21, %x[n_channels]\n" + "fmla z31.s, p3/M, z6.s, z12.s\n" + "prfm pldl1keep, [x16, x25]\n" + "addvl x10, x10, #1\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x10, x5]\n" + "cmp x20, %x[n_channels]\n" + "fmla z25.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p1/Z, [x10]\n" + "fmla z29.s, p3/M, z8.s, z11.s\n" + "prfm pldl1keep, [x13, x17]\n" + "fmla z26.s, p3/M, z5.s, z11.s\n" + "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n" + "fmla z23.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x16, x27, LSL #2]\n" + "fmla z24.s, p3/M, z7.s, z13.s\n" + "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n" + "fmla z25.s, p3/M, z8.s, z13.s\n" + "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n" + "fmla z23.s, p3/M, z6.s, z13.s\n" + "ld1w { z13.s }, p1/Z, [x13, x9, LSL #2]\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n" + "addvl x6, x6, #16\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n" + "addvl x6, x6, #-6\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "st1w { z31.s }, p0, [x14]\n" + "mov z31.d, z16.d\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z30.s }, p0, [x14, x15, LSL #2]\n" + "mov z30.d, z16.d\n" + "fmin z28.s, p3/M, z28.s, 
z17.s\n" + "st1w { z29.s }, p0, [x14, x24, LSL #2]\n" + "mov z29.d, z16.d\n" + "addvl x14, x14, #1\n" + "fmax z27.s, p3/M, z27.s, z18.s\n" + "st1w { z28.s }, p0, [x23]\n" + "mov z28.d, z16.d\n" + "fmax z26.s, p3/M, z26.s, z18.s\n" + "fmax z25.s, p3/M, z25.s, z18.s\n" + "fmax z24.s, p3/M, z24.s, z18.s\n" + "fmin z27.s, p3/M, z27.s, z17.s\n" + "st1w { z27.s }, p0, [x23, x15, LSL #2]\n" + "mov z27.d, z16.d\n" + "fmin z26.s, p3/M, z26.s, z17.s\n" + "st1w { z26.s }, p0, [x23, x24, LSL #2]\n" + "mov z26.d, z16.d\n" + "addvl x23, x23, #1\n" + "fmin z25.s, p3/M, z25.s, z17.s\n" + "st1w { z25.s }, p0, [x22]\n" + "mov z25.d, z16.d\n" + "fmin z24.s, p3/M, z24.s, z17.s\n" + "st1w { z24.s }, p0, [x22, x15, LSL #2]\n" + "mov z24.d, z16.d\n" + "fmax z23.s, p3/M, z23.s, z18.s\n" + "fmin z23.s, p3/M, z23.s, z17.s\n" + "st1w { z23.s }, p0, [x22, x24, LSL #2]\n" + "mov z23.d, z16.d\n" + "addvl x22, x22, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.s, p3/M, z8.s, z9.s\n" + "prfm pldl1keep, [x10, x25]\n" + "mov p0.b, p2.b\n" + "fmla z30.s, p3/M, z7.s, z9.s\n" + "prfm pldl1keep, [x12, x7]\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "prfm pldl1keep, [x16, x7]\n" + "fmla z28.s, p3/M, z5.s, z9.s\n" + "prfm pldl1keep, [x16, x26]\n" + "fmla z27.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x12, x26]\n" + "fmla z26.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x13, x5]\n" + "fmla z25.s, p3/M, z2.s, z9.s\n" + "prfm pldl1keep, [x13, x25]\n" + "fmla z24.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x11, x5]\n" + "fmla z23.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x11, x17]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x8, LSL #2]\n" + "fmla z25.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x27, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x11, x25]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x10, x7]\n" + "fmla z29.s, p3/M, z3.s, z13.s\n" + "prfm pldl1keep, [x13, x7]\n" + "fmla z28.s, p3/M, z2.s, z13.s\n" + "prfm pldl1keep, [x13, x26]\n" + "fmla z27.s, p3/M, z1.s, z13.s\n" + "prfm pldl1keep, [x10, x26]\n" + "fmla z26.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x16, x8, LSL #2]\n" + "fmla z23.s, p3/M, z8.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x16, x28, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x11, x7]\n" + "fmla z30.s, p3/M, z6.s, z11.s\n" + "prfm pldl1keep, [x16, x17]\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "prfm pldl1keep, [x11, x26]\n" + "fmla z27.s, p3/M, z3.s, z11.s\n" + "prfm pldl1keep, [x12, x5]\n" + "fmla z25.s, p3/M, z1.s, z11.s\n" + "prfm pldl1keep, [x12, x25]\n" + "fmla z24.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13]\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "prfm pldl1keep, [x10, x17]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x13, x27, LSL #2]\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "ldr x3, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x3, #0x1\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11]\n" + "fmla z29.s, p3/M, z7.s, z10.s\n" + "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x4, x4, #0x1\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z27.s, p3/M, z5.s, z10.s\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x4, x19\n" + "fmla z26.s, p3/M, z4.s, z10.s\n" + "fmla z24.s, p3/M, z2.s, z10.s\n" + "csel x4, x4, XZR, LT\n" + "fmla z23.s, p3/M, z1.s, 
z10.s\n" + "ld1w { z10.s }, p2/Z, [x11, x9, LSL #2]\n" + "csel x3, x3, x21, LT\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "cmp x3, x20\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z13.s\n" + "fmla z26.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x8, LSL #2]\n" + "fmla z25.s, p3/M, z3.s, z12.s\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x13, x8, LSL #2]\n" + "fmla z27.s, p3/M, z7.s, z10.s\n" + "fmla z26.s, p3/M, z6.s, z10.s\n" + "fmla z25.s, p3/M, z5.s, z10.s\n" + "fmla z28.s, p3/M, z8.s, z10.s\n" + "fmla z24.s, p3/M, z4.s, z10.s\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "fmla z26.s, p3/M, z8.s, z11.s\n" + "fmla z25.s, p3/M, z7.s, z13.s\n" + "fmla z24.s, p3/M, z6.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x28, LSL #2]\n" + "fmla z23.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "fmla z27.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x8, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "fmla z30.s, p3/M, z5.s, z11.s\n" + "fmla z26.s, p3/M, z1.s, z11.s\n" + "fmla z27.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x16, x9, LSL #2]\n" + "fmla z24.s, p3/M, z8.s, z13.s\n" + "fmla z23.s, p3/M, z7.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x11, x28, LSL #2]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmla z27.s, p3/M, z6.s, z12.s\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x12]\n" + "fmla z31.s, p3/M, z2.s, z11.s\n" + "fmla z30.s, p3/M, z1.s, z11.s\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n" + "fmla z27.s, p3/M, z8.s, z13.s\n" + "fmla z26.s, p3/M, z7.s, z13.s\n" + "fmla z24.s, p3/M, z5.s, z13.s\n" + "fmla z23.s, p3/M, z4.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x9, LSL #2]\n" + "fmla z31.s, p3/M, z6.s, z12.s\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "fmla z25.s, p3/M, z0.s, z12.s\n" + "fmla z29.s, p3/M, z8.s, z11.s\n" + "fmla z26.s, p3/M, z5.s, z11.s\n" + "fmla z23.s, p3/M, z2.s, z11.s\n" + "fmla z25.s, p3/M, z8.s, z13.s\n" + "fmla z24.s, p3/M, z7.s, z13.s\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "fmla z23.s, p3/M, z6.s, z13.s\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "st1w { z31.s }, p0, [x14]\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z30.s }, p0, [x14, x15, LSL #2]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "fmax z27.s, p3/M, z27.s, z18.s\n" + "st1w { z29.s }, p0, [x14, x24, LSL #2]\n" + "fmax z26.s, p3/M, z26.s, z18.s\n" + "fmax z25.s, p3/M, z25.s, z18.s\n" + "fmax z24.s, p3/M, z24.s, z18.s\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z28.s }, p0, [x23]\n" + "fmin z27.s, p3/M, z27.s, z17.s\n" + "fmin z26.s, p3/M, z26.s, z17.s\n" + "st1w { z27.s }, p0, [x23, x15, LSL #2]\n" + "fmin z25.s, p3/M, z25.s, z17.s\n" + "fmin z24.s, p3/M, z24.s, z17.s\n" + "st1w { z26.s }, p0, [x23, x24, LSL #2]\n" + "fmax z23.s, p3/M, z23.s, z18.s\n" + "st1w { z25.s }, p0, [x22]\n" + "fmin z23.s, p3/M, z23.s, z17.s\n" + "st1w { z24.s }, p0, [x22, x15, LSL #2]\n" + "st1w { z23.s }, p0, [x22, x24, LSL #2]\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), 
[offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
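The inline comments in the tile-loop prologue above spell out the only non-obvious arithmetic in the direct kernel: turning (tile_i, tile_j) into an input pointer. A minimal scalar sketch of that addressing, with illustrative names (tile_inptr is not part of this patch); the multiplier 3 is the "mov x22, #0x3" the prologue loads, and C++ pointer arithmetic stands in for the LSL #2 sizeof(float) scaling:

#include <cstdint>

// Sketch only: mirrors "offset = tile_i * ld_input_row; offset += tile_j * ld_input_col;
// offset *= kernel_stride * output_size" from the prologue. For this stride-1 kernel with a
// 3x3 output tile, the multiplier is 3: each tile advances three input rows/columns.
static const float *tile_inptr(const float *inptr, uint64_t tile_i, uint64_t tile_j,
                               int64_t ld_input_row, int64_t ld_input_col)
{
    int64_t offset = (int64_t) tile_i * ld_input_row + (int64_t) tile_j * ld_input_col;
    offset *= 3;            // kernel_stride * output_size
    return inptr + offset;  // the + on float* supplies the * sizeof(float), as LSL #2 does
}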
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..7c6fb306b7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[25];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ptrue p3.b\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cntb x14, ALL, MUL #2\n"
+    "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x13, #0x0\n"
+    "ld1w { z16.s }, p3/Z, [x16]\n"
+    "mov z31.d, z16.d\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "cntw x12\n"
+    "mov z30.d, z16.d\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "sub x11, XZR, x12\n"
+    "mov z29.d, z16.d\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "mov z28.d, z16.d\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "cmp x12, %x[n_channels]\n"
+    "mov z27.d, z16.d\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "mov z26.d, z16.d\n"
+    "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "mov z25.d, z16.d\n"
+    "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "mov z24.d, z16.d\n"
+    "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "mov z23.d, z16.d\n"
+    "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "ldp x10, x22, [x15, #0x0]\n"
+    "ldp x9, x28, [x15, #0x10]\n"
+    "ldr x24, [x15, #0x20]\n"
+    "ld1w { z9.s }, p2/Z, [x10, x13, LSL #2]\n"
+    "prfm pldl1keep, [x10, x14]\n"
+    "ld1w { z10.s }, p2/Z, [x22, x13, LSL #2]\n"
+    "prfm pldl1keep, [x22, x14]\n"
+    "ld1w { z11.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "prfm pldl1keep, [x9, x14]\n"
+    "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n"
+    "prfm pldl1keep, [x28, x14]\n"
+    "ld1w { z13.s }, p2/Z, [x24, x13, LSL #2]\n"
+    "prfm pldl1keep, [x24, x14]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "fmla z31.s, p3/M, z8.s, z9.s\n"
+    "ldr x27, [x15, #0x28]\n"
+    "whilelt p1.s, x12, %x[n_channels]\n"
+    "fmla z30.s, p3/M, z7.s, z9.s\n"
+    "ldr x23, [x15, #0x30]\n"
+    "incw x11\n"
+    "fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x26, [x15, #0x38]\n"
+    "mov p0.b, p2.b\n"
+
"fmla z28.s, p3/M, z5.s, z9.s\n" + "prfm pldl1keep, [x27, x14]\n" + "fmla z27.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z26.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z25.s, p3/M, z2.s, z9.s\n" + "ldr x25, [x15, #0x40]\n" + "fmla z24.s, p3/M, z1.s, z9.s\n" + "ldr x19, [x15, #0x48]\n" + "fmla z23.s, p3/M, z0.s, z9.s\n" + "ldr x24, [x15, #0x50]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "prfm pldl1keep, [x25, x14]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z25.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z13.s\n" + "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z3.s, z13.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmla z28.s, p3/M, z2.s, z13.s\n" + "ldr x23, [x15, #0x58]\n" + "fmla z27.s, p3/M, z1.s, z13.s\n" + "ldr x22, [x15, #0x60]\n" + "fmla z26.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z23.s, p3/M, z8.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z30.s, p3/M, z6.s, z11.s\n" + "prfm pldl1keep, [x22, x14]\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "ldr x21, [x15, #0x68]\n" + "fmla z27.s, p3/M, z3.s, z11.s\n" + "ldr x20, [x15, #0x70]\n" + "fmla z25.s, p3/M, z1.s, z11.s\n" + "ldr x19, [x15, #0x78]\n" + "fmla z24.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z27.s, p3/M, z5.s, z10.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z26.s, p3/M, z4.s, z10.s\n" + "ldr x10, [x15, #0x80]\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n" + "fmla z29.s, p3/M, z7.s, z10.s\n" + "ldr x22, [x15, #0x88]\n" + "fmla z24.s, p3/M, z2.s, z10.s\n" + "prfm pldl1keep, [x10, x14]\n" + "fmla z23.s, p3/M, z1.s, z10.s\n" + "ldr x9, [x15, #0x90]\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "prfm pldl1keep, [x22, x14]\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x9, x14]\n" + "fmla z26.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z25.s, p3/M, z3.s, z12.s\n" + "ldr x28, [x15, #0x98]\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n" + "fmla z27.s, p3/M, z7.s, z10.s\n" + "ldr x24, [x15, #0xa0]\n" + "fmla z26.s, p3/M, z6.s, z10.s\n" + "prfm pldl1keep, [x28, x14]\n" + "fmla z25.s, p3/M, z5.s, z10.s\n" + "ldr x27, [x15, #0xa8]\n" + "fmla z28.s, p3/M, z8.s, z10.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmla z24.s, p3/M, z4.s, z10.s\n" + "ldr x23, [x15, #0xb0]\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "prfm pldl1keep, [x27, x14]\n" + "fmla z26.s, p3/M, z8.s, z11.s\n" + "ldr x26, [x15, #0xb8]\n" + "fmla z25.s, p3/M, z7.s, z13.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z24.s, p3/M, z6.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x9, x13, LSL #2]\n" + "fmla z23.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "ldr x25, [x15, #0xc0]\n" + 
"fmla z28.s, p3/M, z1.s, z12.s\n" + "ldp x10, x22, [x15, #0x0]\n" + "fmla z27.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "prfm pldl1keep, [x25, x14]\n" + "addvl x14, x14, #1\n" + "fmla z30.s, p3/M, z5.s, z11.s\n" + "ld1w { z9.s }, p1/Z, [x10, x12, LSL #2]\n" + "fmla z26.s, p3/M, z1.s, z11.s\n" + "prfm pldl1keep, [x10, x14]\n" + "fmla z27.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z24.s, p3/M, z8.s, z13.s\n" + "ld1w { z10.s }, p1/Z, [x22, x12, LSL #2]\n" + "fmla z23.s, p3/M, z7.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "prfm pldl1keep, [x22, x14]\n" + "fmla z27.s, p3/M, z6.s, z12.s\n" + "ldp x9, x28, [x15, #0x10]\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "ldr x24, [x15, #0x20]\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z11.s\n" + "prfm pldl1keep, [x9, x14]\n" + "fmla z30.s, p3/M, z1.s, z11.s\n" + "prfm pldl1keep, [x28, x14]\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z27.s, p3/M, z8.s, z13.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmla z26.s, p3/M, z7.s, z13.s\n" + "ldr x22, [x17, #0x0]\n" + "fmla z24.s, p3/M, z5.s, z13.s\n" + "ldr x21, [x17, #0x8]\n" + "fmla z23.s, p3/M, z4.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n" + "incw x13\n" + "fmla z31.s, p3/M, z6.s, z12.s\n" + "ldr x20, [x17, #0x10]\n" + "whilelt p2.s, x13, %x[n_channels]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ldr x19, [x17, #0x18]\n" + "fmla z25.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p1/Z, [x28, x12, LSL #2]\n" + "fmla z29.s, p3/M, z8.s, z11.s\n" + "ld1w { z16.s }, p3/Z, [x16]\n" + "fmla z26.s, p3/M, z5.s, z11.s\n" + "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n" + "fmla z23.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x9, x12, LSL #2]\n" + "fmla z25.s, p3/M, z8.s, z13.s\n" + "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n" + "fmla z24.s, p3/M, z7.s, z13.s\n" + "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n" + "fmla z23.s, p3/M, z6.s, z13.s\n" + "ld1w { z13.s }, p1/Z, [x24, x12, LSL #2]\n" + "incw x12\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n" + "cmp x12, %x[n_channels]\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n" + "addvl x16, x16, #16\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n" + "fmax z27.s, p3/M, z27.s, z18.s\n" + "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n" + "addvl x16, x16, #-6\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "st1w { z31.s }, p0, [x22, x11, LSL #2]\n" + "mov z31.d, z16.d\n" + "ldr x22, [x17, #0x20]\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z30.s }, p0, [x21, x11, LSL #2]\n" + "mov z30.d, z16.d\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z29.s }, p0, [x20, x11, LSL #2]\n" + "mov z29.d, z16.d\n" + "ldr x21, [x17, #0x28]\n" + "fmin z27.s, p3/M, z27.s, z17.s\n" + "ldr x20, [x17, #0x30]\n" + "fmax z26.s, p3/M, z26.s, z18.s\n" + "st1w { z28.s }, p0, [x19, x11, LSL #2]\n" + "mov z28.d, z16.d\n" + "ldr x19, [x17, #0x38]\n" + "fmax z25.s, p3/M, z25.s, z18.s\n" + "st1w { z27.s }, p0, [x22, x11, LSL #2]\n" + "mov z27.d, z16.d\n" + "ldr x22, [x17, #0x40]\n" + "fmin z26.s, p3/M, z26.s, z17.s\n" + "st1w { z26.s }, p0, [x21, x11, LSL #2]\n" + "mov 
z26.d, z16.d\n" + "fmin z25.s, p3/M, z25.s, z17.s\n" + "st1w { z25.s }, p0, [x20, x11, LSL #2]\n" + "mov z25.d, z16.d\n" + "fmax z24.s, p3/M, z24.s, z18.s\n" + "fmax z23.s, p3/M, z23.s, z18.s\n" + "fmin z24.s, p3/M, z24.s, z17.s\n" + "st1w { z24.s }, p0, [x19, x11, LSL #2]\n" + "mov z24.d, z16.d\n" + "fmin z23.s, p3/M, z23.s, z17.s\n" + "st1w { z23.s }, p0, [x22, x11, LSL #2]\n" + "mov z23.d, z16.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.s, p3/M, z8.s, z9.s\n" + "ldr x27, [x15, #0x28]\n" + "incw x11\n" + "fmla z30.s, p3/M, z7.s, z9.s\n" + "ldr x23, [x15, #0x30]\n" + "mov p0.b, p2.b\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "ldr x26, [x15, #0x38]\n" + "fmla z28.s, p3/M, z5.s, z9.s\n" + "prfm pldl1keep, [x27, x14]\n" + "fmla z27.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z26.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z25.s, p3/M, z2.s, z9.s\n" + "ldr x25, [x15, #0x40]\n" + "fmla z24.s, p3/M, z1.s, z9.s\n" + "ldr x19, [x15, #0x48]\n" + "fmla z23.s, p3/M, z0.s, z9.s\n" + "ldr x24, [x15, #0x50]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "prfm pldl1keep, [x25, x14]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z25.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x13, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z13.s\n" + "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z3.s, z13.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmla z28.s, p3/M, z2.s, z13.s\n" + "ldr x23, [x15, #0x58]\n" + "fmla z27.s, p3/M, z1.s, z13.s\n" + "ldr x22, [x15, #0x60]\n" + "fmla z26.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z23.s, p3/M, z8.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x25, x13, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z30.s, p3/M, z6.s, z11.s\n" + "prfm pldl1keep, [x22, x14]\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "ldr x21, [x15, #0x68]\n" + "fmla z27.s, p3/M, z3.s, z11.s\n" + "ldr x20, [x15, #0x70]\n" + "fmla z25.s, p3/M, z1.s, z11.s\n" + "ldr x19, [x15, #0x78]\n" + "fmla z24.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z27.s, p3/M, z5.s, z10.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z26.s, p3/M, z4.s, z10.s\n" + "ldr x10, [x15, #0x80]\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n" + "fmla z29.s, p3/M, z7.s, z10.s\n" + "ldr x22, [x15, #0x88]\n" + "fmla z24.s, p3/M, z2.s, z10.s\n" + "prfm pldl1keep, [x10, x14]\n" + "fmla z23.s, p3/M, z1.s, z10.s\n" + "ldr x9, [x15, #0x90]\n" + "fmla z30.s, p3/M, z8.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x21, x13, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "prfm pldl1keep, [x22, x14]\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x9, x14]\n" + "fmla z26.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z25.s, p3/M, z3.s, z12.s\n" + "ldr x28, [x15, #0x98]\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x13, LSL #2]\n" + "fmla z27.s, p3/M, z7.s, z10.s\n" + "ldr x24, [x15, #0xa0]\n" + "fmla z26.s, p3/M, z6.s, z10.s\n" + "prfm pldl1keep, [x28, x14]\n" + "fmla z25.s, p3/M, z5.s, 
z10.s\n" + "ldr x27, [x15, #0xa8]\n" + "fmla z28.s, p3/M, z8.s, z10.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmla z24.s, p3/M, z4.s, z10.s\n" + "ldr x23, [x15, #0xb0]\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "prfm pldl1keep, [x27, x14]\n" + "fmla z26.s, p3/M, z8.s, z11.s\n" + "ldr x26, [x15, #0xb8]\n" + "fmla z25.s, p3/M, z7.s, z13.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z24.s, p3/M, z6.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x9, x13, LSL #2]\n" + "fmla z23.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "ldr x25, [x15, #0xc0]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "ldr x22, [x17, #0x0]\n" + "fmla z27.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x28, x13, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "prfm pldl1keep, [x25, x14]\n" + "fmla z30.s, p3/M, z5.s, z11.s\n" + "ldr x21, [x17, #0x8]\n" + "fmla z26.s, p3/M, z1.s, z11.s\n" + "ldr x20, [x17, #0x10]\n" + "fmla z27.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z24.s, p3/M, z8.s, z13.s\n" + "ldr x19, [x17, #0x18]\n" + "fmla z23.s, p3/M, z7.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x27, x13, LSL #2]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmla z27.s, p3/M, z6.s, z12.s\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z11.s\n" + "fmla z30.s, p3/M, z1.s, z11.s\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z27.s, p3/M, z8.s, z13.s\n" + "fmla z26.s, p3/M, z7.s, z13.s\n" + "fmla z24.s, p3/M, z5.s, z13.s\n" + "fmla z23.s, p3/M, z4.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n" + "fmla z31.s, p3/M, z6.s, z12.s\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "fmla z25.s, p3/M, z0.s, z12.s\n" + "fmla z29.s, p3/M, z8.s, z11.s\n" + "fmla z26.s, p3/M, z5.s, z11.s\n" + "fmla z23.s, p3/M, z2.s, z11.s\n" + "fmla z25.s, p3/M, z8.s, z13.s\n" + "fmla z24.s, p3/M, z7.s, z13.s\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "fmla z23.s, p3/M, z6.s, z13.s\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "st1w { z31.s }, p0, [x22, x11, LSL #2]\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "ldr x22, [x17, #0x20]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "st1w { z30.s }, p0, [x21, x11, LSL #2]\n" + "fmax z27.s, p3/M, z27.s, z18.s\n" + "fmax z26.s, p3/M, z26.s, z18.s\n" + "st1w { z29.s }, p0, [x20, x11, LSL #2]\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "ldr x21, [x17, #0x28]\n" + "fmax z25.s, p3/M, z25.s, z18.s\n" + "ldr x20, [x17, #0x30]\n" + "fmax z24.s, p3/M, z24.s, z18.s\n" + "st1w { z28.s }, p0, [x19, x11, LSL #2]\n" + "fmin z27.s, p3/M, z27.s, z17.s\n" + "fmin z26.s, p3/M, z26.s, z17.s\n" + "ldr x19, [x17, #0x38]\n" + "fmin z25.s, p3/M, z25.s, z17.s\n" + "st1w { z27.s }, p0, [x22, x11, LSL #2]\n" + "fmin z24.s, p3/M, z24.s, z17.s\n" + "fmax z23.s, p3/M, z23.s, z18.s\n" + "st1w { z26.s }, p0, [x21, x11, LSL #2]\n" + "st1w { z25.s }, p0, [x20, x11, LSL #2]\n" + "fmin z23.s, p3/M, z23.s, z17.s\n" + "st1w { z24.s }, p0, [x19, x11, LSL #2]\n" + "ldr x22, [x17, #0x40]\n" + "st1w { z23.s }, p0, [x22, x11, LSL #2]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" 
(offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
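The indirect variant above receives one pointer per input element rather than row/column strides, which is what allows a caller to aim out-of-bounds taps at a pad buffer instead of reading past the tensor. A hedged sketch of why its Args holds exactly 25 pointers for this 3x3, stride-1, 3x3-output kernel (the constant names are illustrative, chosen to mirror the descriptor structs elsewhere in the patch):

// Sketch: a 3x3 output tile at stride 1 under a 3x3 window reads a 5x5 input patch,
// one pointer per input point, hence "const float *inptrs[25]" in Args above.
constexpr unsigned int output_rows = 3, kernel_rows = 3, stride_rows = 1;
constexpr unsigned int input_rows = (output_rows - 1) * stride_rows + kernel_rows;  // 5
constexpr unsigned int n_input_ptrs = input_rows * input_rows;                      // 25
static_assert(n_input_ptrs == 25, "matches Args::inptrs[25]");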
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..a9823e3917
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+struct sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+  typedef float bias_type;
+  typedef float input_type;
+  typedef float weight_type;
+  typedef float return_type;
+
+  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
+  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
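The descriptor just added exposes the kernel geometry as compile-time constants that the dispatcher can consult. A usage sketch under the assumption that the header above is included; the static_asserts simply restate the stride-1 depthfirst relation input = (output - 1) * stride + kernel, which the literals 6, 4, 3 satisfy:

// Assumes: #include "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp" (path as in the patch).
using K = arm_conv::depthwise::sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst;

static_assert(K::input_rows == (K::output_rows - 1) * K::stride_rows + K::kernel_rows,
              "input patch height: 6 = (4 - 1) * 1 + 3");
static_assert(K::input_cols == (K::output_cols - 1) * K::stride_cols + K::kernel_cols,
              "input patch width: 6 = (4 - 1) * 1 + 3");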
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4c24ad9c15
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FEATURE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "1:" // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x24, #0x4\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov x23, #0x4\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_params]]\n"
+    "mov x5, #0x0\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "cntw x6\n"
+    "ldr x7, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "sub x21, XZR, x6\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x19, x2, x22\n" // offset = tile_i * ld_input_row
+    "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x19, x3, x7, x19\n" // offset += tile_j * ld_input_col
+    "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x19, x19, x24\n" // offset *= kernel_stride * output_size
+    "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x8, x8, x19, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "add x15, x8, x22, LSL #2\n"
+    "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x14, x15, x22, LSL #2\n"
+    "ld1w { z13.s }, p3/Z, [x4]\n"
+    "mov z31.d, z13.d\n"
+    "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n"
+    "add x13, x14, x22, LSL #2\n"
+    "mov z30.d, z13.d\n"
+    "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n"
+    "add x12, x13, x22, LSL #2\n"
+    "mov z29.d, z13.d\n"
+    "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n"
+    "add x11, x12, x22, LSL #2\n"
+    "mov z28.d, z13.d\n"
+    "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n"
+    "add x10, x7, x7\n"
+    "mov z27.d, z13.d\n"
+    "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n"
+    "add x9, x10, x7\n"
+    "mov z26.d, z13.d\n"
+    "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n"
+    "add x28, x9, x7\n"
+    "mov z25.d, z13.d\n"
+    "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n"
+    "add x27, x28, x7\n"
+    "mov z24.d, z13.d\n"
+    "mul x19, x2, x20\n" // offset = tile_i * ld_output_row
+    "mov z23.d, 
z13.d\n" + "madd x19, x3, x17, x19\n" // offset += tile_j * ld_output_col + "mov z22.d, z13.d\n" + "mul x19, x19, x23\n" // offset *= output_tile_size + "mov z21.d, z13.d\n" + "add x16, x16, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "mov z20.d, z13.d\n" + "add x26, x16, x20, LSL #2\n" + "mov z19.d, z13.d\n" + "add x25, x26, x20, LSL #2\n" + "mov z18.d, z13.d\n" + "add x24, x25, x20, LSL #2\n" + "mov z17.d, z13.d\n" + "add x23, x17, x17\n" + "mov z16.d, z13.d\n" + "add x22, x23, x17\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "ld1w { z9.s }, p2/Z, [x14, x10, LSL #2]\n" + "ld1w { z10.s }, p2/Z, [x8]\n" + "addvl x4, x4, #16\n" + "ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n" + "cmp x6, %x[n_channels]\n" + "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n" + "addvl x4, x4, #-6\n" + "ld1w { z12.s }, p2/Z, [x14, x9, LSL #2]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.s, p3/M, z8.s, z9.s\n" + "ld1w { z13.s }, p3/Z, [x4]\n" + "whilelt p1.s, x6, %x[n_channels]\n" + "fmla z30.s, p3/M, z7.s, z9.s\n" + "incw x21\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "mov p0.b, p2.b\n" + "fmla z27.s, p3/M, z5.s, z9.s\n" + "incw x5\n" + "fmla z26.s, p3/M, z4.s, z9.s\n" + "incw x6\n" + "fmla z25.s, p3/M, z3.s, z9.s\n" + "fmla z23.s, p3/M, z2.s, z9.s\n" + "fmla z22.s, p3/M, z1.s, z9.s\n" + "fmla z21.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x11]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n" + "fmla z30.s, p3/M, z8.s, z12.s\n" + "fmla z29.s, p3/M, z7.s, z12.s\n" + "fmla z26.s, p3/M, z5.s, z12.s\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "fmla z22.s, p3/M, z2.s, z12.s\n" + "fmla z21.s, p3/M, z1.s, z12.s\n" + "fmla z20.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n" + "fmla z19.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n" + "fmla z16.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n" + "fmla z27.s, p3/M, z8.s, z9.s\n" + "fmla z26.s, p3/M, z7.s, z9.s\n" + "fmla z25.s, p3/M, z6.s, z9.s\n" + "fmla z23.s, p3/M, z5.s, z9.s\n" + "fmla z22.s, p3/M, z4.s, z9.s\n" + "fmla z21.s, p3/M, z3.s, z9.s\n" + "fmla z19.s, p3/M, z2.s, z9.s\n" + "fmla z18.s, p3/M, z1.s, z9.s\n" + "fmla z17.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x15]\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "fmla z28.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z26.s, p3/M, z8.s, z10.s\n" + "fmla z25.s, p3/M, z7.s, z10.s\n" + "fmla z24.s, p3/M, z6.s, z10.s\n" + "fmla z22.s, p3/M, z5.s, z10.s\n" + "fmla z21.s, p3/M, z4.s, z10.s\n" + "fmla z20.s, p3/M, z3.s, z10.s\n" + "fmla z18.s, p3/M, z2.s, z10.s\n" + "fmla z17.s, p3/M, z1.s, z10.s\n" + "fmla z16.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "fmla z27.s, p3/M, z0.s, z9.s\n" + "fmla z28.s, p3/M, z5.s, z12.s\n" + "fmla z24.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n" + "fmla z23.s, p3/M, z6.s, z11.s\n" + "fmla z19.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n" + "fmla z31.s, p3/M, z5.s, z10.s\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "fmla z29.s, p3/M, z3.s, z10.s\n" + "fmla z27.s, p3/M, z2.s, z10.s\n" + "fmla z26.s, p3/M, z1.s, z10.s\n" + "fmla 
z25.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n" + "fmla z20.s, p3/M, z8.s, z11.s\n" + "fmla z16.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "fmla z26.s, p3/M, z2.s, z12.s\n" + "fmla z25.s, p3/M, z1.s, z12.s\n" + "fmla z24.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n" + "fmla z19.s, p3/M, z7.s, z11.s\n" + "fmla z18.s, p3/M, z6.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z10.s\n" + "fmla z30.s, p3/M, z6.s, z10.s\n" + "fmla z27.s, p3/M, z4.s, z10.s\n" + "fmla z26.s, p3/M, z3.s, z10.s\n" + "fmla z23.s, p3/M, z1.s, z10.s\n" + "fmla z22.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n" + "fmla z17.s, p3/M, z8.s, z11.s\n" + "fmla z16.s, p3/M, z7.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmla z25.s, p3/M, z5.s, z12.s\n" + "fmla z24.s, p3/M, z4.s, z12.s\n" + "fmla z21.s, p3/M, z2.s, z12.s\n" + "fmla z20.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n" + "addvl x8, x8, #1\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "fmla z29.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x14]\n" + "fmla z27.s, p3/M, z7.s, z11.s\n" + "fmla z26.s, p3/M, z6.s, z11.s\n" + "fmla z23.s, p3/M, z4.s, z11.s\n" + "fmla z22.s, p3/M, z3.s, z11.s\n" + "fmla z19.s, p3/M, z1.s, z11.s\n" + "fmla z18.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n" + "addvl x14, x14, #1\n" + "fmla z31.s, p3/M, z6.s, z10.s\n" + "ld1w { z9.s }, p1/Z, [x14, x10, LSL #2]\n" + "fmla z27.s, p3/M, z3.s, z10.s\n" + "fmla z23.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x13]\n" + "fmla z25.s, p3/M, z8.s, z11.s\n" + "fmla z24.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z5.s, z11.s\n" + "fmla z20.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z2.s, z11.s\n" + "fmla z16.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n" + "fmla z28.s, p3/M, z8.s, z12.s\n" + "fmla z24.s, p3/M, z5.s, z12.s\n" + "fmla z20.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n" + "addvl x13, x13, #1\n" + "fmla z27.s, p3/M, z6.s, z10.s\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "fmla z19.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n" + "fmla z22.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z6.s, z11.s\n" + "fmla z23.s, p3/M, z8.s, z11.s\n" + "fmla z19.s, p3/M, z5.s, z11.s\n" + "fmla z18.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n" + "fmla z24.s, p3/M, z8.s, z12.s\n" + "fmla z20.s, p3/M, z5.s, z12.s\n" + "fmla z16.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n" + "addvl x11, x11, #1\n" + "fmla z19.s, p3/M, z8.s, z10.s\n" + "fmla z18.s, p3/M, z7.s, z10.s\n" + "fmla z17.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n" + "fmla z22.s, p3/M, z8.s, z11.s\n" + "fmla z21.s, p3/M, z7.s, z11.s\n" + "fmla z20.s, p3/M, z6.s, z11.s\n" + "fmla z18.s, p3/M, z5.s, z11.s\n" + "fmla z17.s, p3/M, z4.s, z11.s\n" + "fmla z16.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n" + "addvl x15, x15, #1\n" + "fmla z18.s, p3/M, z8.s, z12.s\n" + "fmla z31.s, p3/M, z4.s, 
z10.s\n" + "fmla z17.s, p3/M, z7.s, z12.s\n" + "fmla z16.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z10.s\n" + "fmla z27.s, p3/M, z1.s, z10.s\n" + "fmla z26.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n" + "whilelt p2.s, x5, %x[n_channels]\n" + "fmla z29.s, p3/M, z5.s, z11.s\n" + "ld1w { z0.s }, p3/Z, [x4, #1, MUL VL]\n" + "addvl x12, x12, #1\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "cmp x6, %x[n_channels]\n" + "fmla z25.s, p3/M, z2.s, z11.s\n" + "ld1w { z2.s }, p3/Z, [x4, #3, MUL VL]\n" + "fmla z24.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n" + "fmla z23.s, p3/M, z7.s, z12.s\n" + "ld1w { z1.s }, p3/Z, [x4, #2, MUL VL]\n" + "fmla z22.s, p3/M, z6.s, z12.s\n" + "ld1w { z6.s }, p3/Z, [x4, #7, MUL VL]\n" + "fmla z19.s, p3/M, z4.s, z12.s\n" + "fmla z18.s, p3/M, z3.s, z12.s\n" + "ld1w { z12.s }, p1/Z, [x14, x9, LSL #2]\n" + "fmla z21.s, p3/M, z8.s, z10.s\n" + "ld1w { z3.s }, p3/Z, [x4, #4, MUL VL]\n" + "fmla z20.s, p3/M, z7.s, z10.s\n" + "fmla z17.s, p3/M, z5.s, z10.s\n" + "ld1w { z5.s }, p3/Z, [x4, #6, MUL VL]\n" + "fmla z16.s, p3/M, z4.s, z10.s\n" + "ld1w { z10.s }, p1/Z, [x8]\n" + "fmax z31.s, p3/M, z31.s, z15.s\n" + "ld1w { z4.s }, p3/Z, [x4, #5, MUL VL]\n" + "addvl x4, x4, #16\n" + "fmax z30.s, p3/M, z30.s, z15.s\n" + "ld1w { z7.s }, p3/Z, [x4, #-8, MUL VL]\n" + "fmax z29.s, p3/M, z29.s, z15.s\n" + "ld1w { z8.s }, p3/Z, [x4, #-7, MUL VL]\n" + "addvl x4, x4, #-6\n" + "fmin z31.s, p3/M, z31.s, z14.s\n" + "st1w { z31.s }, p0, [x16]\n" + "mov z31.d, z13.d\n" + "fmin z30.s, p3/M, z30.s, z14.s\n" + "st1w { z30.s }, p0, [x16, x17, LSL #2]\n" + "mov z30.d, z13.d\n" + "fmin z29.s, p3/M, z29.s, z14.s\n" + "st1w { z29.s }, p0, [x16, x23, LSL #2]\n" + "mov z29.d, z13.d\n" + "fmax z28.s, p3/M, z28.s, z15.s\n" + "fmax z27.s, p3/M, z27.s, z15.s\n" + "fmax z26.s, p3/M, z26.s, z15.s\n" + "fmax z25.s, p3/M, z25.s, z15.s\n" + "fmin z28.s, p3/M, z28.s, z14.s\n" + "st1w { z28.s }, p0, [x16, x22, LSL #2]\n" + "mov z28.d, z13.d\n" + "addvl x16, x16, #1\n" + "fmin z27.s, p3/M, z27.s, z14.s\n" + "st1w { z27.s }, p0, [x26]\n" + "mov z27.d, z13.d\n" + "fmin z26.s, p3/M, z26.s, z14.s\n" + "st1w { z26.s }, p0, [x26, x17, LSL #2]\n" + "mov z26.d, z13.d\n" + "fmin z25.s, p3/M, z25.s, z14.s\n" + "st1w { z25.s }, p0, [x26, x23, LSL #2]\n" + "mov z25.d, z13.d\n" + "fmax z24.s, p3/M, z24.s, z15.s\n" + "fmax z23.s, p3/M, z23.s, z15.s\n" + "fmax z22.s, p3/M, z22.s, z15.s\n" + "fmax z21.s, p3/M, z21.s, z15.s\n" + "fmin z24.s, p3/M, z24.s, z14.s\n" + "st1w { z24.s }, p0, [x26, x22, LSL #2]\n" + "mov z24.d, z13.d\n" + "addvl x26, x26, #1\n" + "fmin z23.s, p3/M, z23.s, z14.s\n" + "st1w { z23.s }, p0, [x25]\n" + "mov z23.d, z13.d\n" + "fmin z22.s, p3/M, z22.s, z14.s\n" + "st1w { z22.s }, p0, [x25, x17, LSL #2]\n" + "mov z22.d, z13.d\n" + "fmin z21.s, p3/M, z21.s, z14.s\n" + "st1w { z21.s }, p0, [x25, x23, LSL #2]\n" + "mov z21.d, z13.d\n" + "fmax z20.s, p3/M, z20.s, z15.s\n" + "fmax z19.s, p3/M, z19.s, z15.s\n" + "fmax z18.s, p3/M, z18.s, z15.s\n" + "fmax z17.s, p3/M, z17.s, z15.s\n" + "fmin z20.s, p3/M, z20.s, z14.s\n" + "st1w { z20.s }, p0, [x25, x22, LSL #2]\n" + "mov z20.d, z13.d\n" + "addvl x25, x25, #1\n" + "fmin z19.s, p3/M, z19.s, z14.s\n" + "st1w { z19.s }, p0, [x24]\n" + "mov z19.d, z13.d\n" + "fmin z18.s, p3/M, z18.s, z14.s\n" + "st1w { z18.s }, p0, [x24, x17, LSL #2]\n" + "mov z18.d, z13.d\n" + "fmin z17.s, p3/M, z17.s, z14.s\n" + "st1w { z17.s }, p0, [x24, x23, LSL #2]\n" + "mov z17.d, z13.d\n" + "fmax z16.s, p3/M, 
z16.s, z15.s\n" + "fmin z16.s, p3/M, z16.s, z14.s\n" + "st1w { z16.s }, p0, [x24, x22, LSL #2]\n" + "mov z16.d, z13.d\n" + "addvl x24, x24, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.s, p3/M, z8.s, z9.s\n" + "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov p0.b, p2.b\n" + "fmla z30.s, p3/M, z7.s, z9.s\n" + "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x21, x2, #0x1\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z27.s, p3/M, z5.s, z9.s\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "add x3, x3, #0x1\n" + "fmla z26.s, p3/M, z4.s, z9.s\n" + "cmp x3, x19\n" + "fmla z25.s, p3/M, z3.s, z9.s\n" + "fmla z23.s, p3/M, z2.s, z9.s\n" + "csel x3, x3, XZR, LT\n" + "fmla z22.s, p3/M, z1.s, z9.s\n" + "csel x2, x2, x21, LT\n" + "fmla z21.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x13, x10, LSL #2]\n" + "cmp x2, x20\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x11]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n" + "fmla z30.s, p3/M, z8.s, z12.s\n" + "fmla z29.s, p3/M, z7.s, z12.s\n" + "fmla z26.s, p3/M, z5.s, z12.s\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "fmla z22.s, p3/M, z2.s, z12.s\n" + "fmla z21.s, p3/M, z1.s, z12.s\n" + "fmla z20.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x8, x7, LSL #2]\n" + "fmla z19.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x13, x9, LSL #2]\n" + "fmla z16.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x8, x28, LSL #2]\n" + "fmla z27.s, p3/M, z8.s, z9.s\n" + "fmla z26.s, p3/M, z7.s, z9.s\n" + "fmla z25.s, p3/M, z6.s, z9.s\n" + "fmla z23.s, p3/M, z5.s, z9.s\n" + "fmla z22.s, p3/M, z4.s, z9.s\n" + "fmla z21.s, p3/M, z3.s, z9.s\n" + "fmla z19.s, p3/M, z2.s, z9.s\n" + "fmla z18.s, p3/M, z1.s, z9.s\n" + "fmla z17.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x15]\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x15, x27, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "fmla z28.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12]\n" + "fmla z26.s, p3/M, z8.s, z10.s\n" + "fmla z25.s, p3/M, z7.s, z10.s\n" + "fmla z24.s, p3/M, z6.s, z10.s\n" + "fmla z22.s, p3/M, z5.s, z10.s\n" + "fmla z21.s, p3/M, z4.s, z10.s\n" + "fmla z20.s, p3/M, z3.s, z10.s\n" + "fmla z18.s, p3/M, z2.s, z10.s\n" + "fmla z17.s, p3/M, z1.s, z10.s\n" + "fmla z16.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x15, x10, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "fmla z27.s, p3/M, z0.s, z9.s\n" + "fmla z28.s, p3/M, z5.s, z12.s\n" + "fmla z24.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x15, x9, LSL #2]\n" + "fmla z23.s, p3/M, z6.s, z11.s\n" + "fmla z19.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n" + "fmla z31.s, p3/M, z5.s, z10.s\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "fmla z29.s, p3/M, z3.s, z10.s\n" + "fmla z27.s, p3/M, z2.s, z10.s\n" + "fmla z26.s, p3/M, z1.s, z10.s\n" + "fmla z25.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x14, x7, LSL #2]\n" + "fmla z20.s, p3/M, z8.s, z11.s\n" + "fmla z16.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x7, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "fmla z26.s, p3/M, z2.s, z12.s\n" + "fmla z25.s, p3/M, z1.s, z12.s\n" + "fmla z24.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x14, x28, LSL #2]\n" + "fmla z19.s, 
p3/M, z7.s, z11.s\n" + "fmla z18.s, p3/M, z6.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x28, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z10.s\n" + "fmla z30.s, p3/M, z6.s, z10.s\n" + "fmla z27.s, p3/M, z4.s, z10.s\n" + "fmla z26.s, p3/M, z3.s, z10.s\n" + "fmla z23.s, p3/M, z1.s, z10.s\n" + "fmla z22.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x8, x10, LSL #2]\n" + "fmla z17.s, p3/M, z8.s, z11.s\n" + "fmla z16.s, p3/M, z7.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x7, LSL #2]\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "fmla z25.s, p3/M, z5.s, z12.s\n" + "fmla z24.s, p3/M, z4.s, z12.s\n" + "fmla z21.s, p3/M, z2.s, z12.s\n" + "fmla z20.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x8, x9, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "fmla z29.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x14]\n" + "fmla z27.s, p3/M, z7.s, z11.s\n" + "fmla z26.s, p3/M, z6.s, z11.s\n" + "fmla z23.s, p3/M, z4.s, z11.s\n" + "fmla z22.s, p3/M, z3.s, z11.s\n" + "fmla z19.s, p3/M, z1.s, z11.s\n" + "fmla z18.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x28, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x14, x27, LSL #2]\n" + "fmla z31.s, p3/M, z6.s, z10.s\n" + "fmla z27.s, p3/M, z3.s, z10.s\n" + "fmla z23.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x13]\n" + "fmla z25.s, p3/M, z8.s, z11.s\n" + "fmla z24.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z5.s, z11.s\n" + "fmla z20.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z2.s, z11.s\n" + "fmla z16.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x10, LSL #2]\n" + "fmla z28.s, p3/M, z8.s, z12.s\n" + "fmla z24.s, p3/M, z5.s, z12.s\n" + "fmla z20.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n" + "fmla z27.s, p3/M, z6.s, z10.s\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "fmla z19.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x11, x10, LSL #2]\n" + "fmla z22.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z6.s, z11.s\n" + "fmla z23.s, p3/M, z8.s, z11.s\n" + "fmla z19.s, p3/M, z5.s, z11.s\n" + "fmla z18.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x9, LSL #2]\n" + "fmla z24.s, p3/M, z8.s, z12.s\n" + "fmla z20.s, p3/M, z5.s, z12.s\n" + "fmla z16.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x9, LSL #2]\n" + "fmla z19.s, p3/M, z8.s, z10.s\n" + "fmla z18.s, p3/M, z7.s, z10.s\n" + "fmla z17.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x15, x7, LSL #2]\n" + "fmla z22.s, p3/M, z8.s, z11.s\n" + "fmla z21.s, p3/M, z7.s, z11.s\n" + "fmla z20.s, p3/M, z6.s, z11.s\n" + "fmla z18.s, p3/M, z5.s, z11.s\n" + "fmla z17.s, p3/M, z4.s, z11.s\n" + "fmla z16.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x15, x28, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z10.s\n" + "fmla z18.s, p3/M, z8.s, z12.s\n" + "fmla z17.s, p3/M, z7.s, z12.s\n" + "fmla z16.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x12, x7, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z10.s\n" + "fmla z27.s, p3/M, z1.s, z10.s\n" + "fmla z26.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x12, x28, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z11.s\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "fmla z25.s, p3/M, z2.s, z11.s\n" + "fmla z24.s, p3/M, z1.s, z11.s\n" + "fmla z23.s, p3/M, z7.s, z12.s\n" + "fmla z22.s, p3/M, z6.s, z12.s\n" + "fmla z19.s, p3/M, z4.s, z12.s\n" + "fmla z18.s, p3/M, z3.s, z12.s\n" + "fmla z21.s, p3/M, z8.s, z10.s\n" + "fmla z20.s, p3/M, z7.s, 
z10.s\n" + "fmla z17.s, p3/M, z5.s, z10.s\n" + "fmla z16.s, p3/M, z4.s, z10.s\n" + "fmax z31.s, p3/M, z31.s, z15.s\n" + "fmax z30.s, p3/M, z30.s, z15.s\n" + "fmax z29.s, p3/M, z29.s, z15.s\n" + "fmax z28.s, p3/M, z28.s, z15.s\n" + "fmin z31.s, p3/M, z31.s, z14.s\n" + "st1w { z31.s }, p0, [x16]\n" + "fmin z30.s, p3/M, z30.s, z14.s\n" + "fmin z29.s, p3/M, z29.s, z14.s\n" + "st1w { z30.s }, p0, [x16, x17, LSL #2]\n" + "fmin z28.s, p3/M, z28.s, z14.s\n" + "fmax z27.s, p3/M, z27.s, z15.s\n" + "st1w { z29.s }, p0, [x16, x23, LSL #2]\n" + "fmax z26.s, p3/M, z26.s, z15.s\n" + "st1w { z28.s }, p0, [x16, x22, LSL #2]\n" + "fmin z27.s, p3/M, z27.s, z14.s\n" + "fmax z25.s, p3/M, z25.s, z15.s\n" + "st1w { z27.s }, p0, [x26]\n" + "fmin z26.s, p3/M, z26.s, z14.s\n" + "fmin z25.s, p3/M, z25.s, z14.s\n" + "st1w { z26.s }, p0, [x26, x17, LSL #2]\n" + "fmax z24.s, p3/M, z24.s, z15.s\n" + "fmax z23.s, p3/M, z23.s, z15.s\n" + "st1w { z25.s }, p0, [x26, x23, LSL #2]\n" + "fmax z22.s, p3/M, z22.s, z15.s\n" + "fmax z21.s, p3/M, z21.s, z15.s\n" + "fmax z20.s, p3/M, z20.s, z15.s\n" + "fmin z24.s, p3/M, z24.s, z14.s\n" + "st1w { z24.s }, p0, [x26, x22, LSL #2]\n" + "fmin z23.s, p3/M, z23.s, z14.s\n" + "fmin z22.s, p3/M, z22.s, z14.s\n" + "st1w { z23.s }, p0, [x25]\n" + "fmin z21.s, p3/M, z21.s, z14.s\n" + "fmin z20.s, p3/M, z20.s, z14.s\n" + "st1w { z22.s }, p0, [x25, x17, LSL #2]\n" + "fmax z19.s, p3/M, z19.s, z15.s\n" + "st1w { z21.s }, p0, [x25, x23, LSL #2]\n" + "fmax z18.s, p3/M, z18.s, z15.s\n" + "fmax z17.s, p3/M, z17.s, z15.s\n" + "st1w { z20.s }, p0, [x25, x22, LSL #2]\n" + "fmin z19.s, p3/M, z19.s, z14.s\n" + "st1w { z19.s }, p0, [x24]\n" + "fmin z18.s, p3/M, z18.s, z14.s\n" + "fmin z17.s, p3/M, z17.s, z14.s\n" + "st1w { z18.s }, p0, [x24, x17, LSL #2]\n" + "fmax z16.s, p3/M, z16.s, z15.s\n" + "st1w { z17.s }, p0, [x24, x23, LSL #2]\n" + "fmin z16.s, p3/M, z16.s, z14.s\n" + "st1w { z16.s }, p0, [x24, x22, LSL #2]\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..ac0c4ec4e3 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[36]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[14]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[5]; + inptrs[3] = input_ptrs[15]; + inptrs[4] = input_ptrs[30]; + inptrs[5] = input_ptrs[35]; + inptrs[6] = input_ptrs[20]; + inptrs[7] = input_ptrs[1]; + inptrs[8] = input_ptrs[4]; + inptrs[9] = input_ptrs[21]; + inptrs[10] = input_ptrs[6]; + inptrs[11] = input_ptrs[11]; + inptrs[12] = input_ptrs[24]; + inptrs[13] = input_ptrs[8]; + inptrs[14] = input_ptrs[29]; + inptrs[15] = input_ptrs[9]; + inptrs[16] = input_ptrs[31]; + inptrs[17] = input_ptrs[13]; + inptrs[18] = input_ptrs[34]; + inptrs[19] = input_ptrs[16]; + inptrs[20] = input_ptrs[2]; + inptrs[21] = input_ptrs[19]; + inptrs[22] = input_ptrs[3]; + inptrs[23] = input_ptrs[12]; + inptrs[24] = input_ptrs[22]; + inptrs[25] = input_ptrs[17]; + inptrs[26] = input_ptrs[18]; + inptrs[27] = input_ptrs[26]; + inptrs[28] = input_ptrs[23]; + inptrs[29] = input_ptrs[32]; + inptrs[30] = input_ptrs[27]; + inptrs[31] = input_ptrs[33]; + inptrs[32] = input_ptrs[7]; + inptrs[33] = input_ptrs[10]; + inptrs[34] = input_ptrs[25]; + inptrs[35] = input_ptrs[28]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x5, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n" + "add x7, 
%x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "cntb x8, ALL, MUL #2\n" + "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "mov x17, #0x0\n" + "ld1w { z13.s }, p3/Z, [x6]\n" + "mov z31.d, z13.d\n" + "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n" + "cntw x16\n" + "mov z30.d, z13.d\n" + "ld1w { z1.s }, p3/Z, [x6, #2, MUL VL]\n" + "sub x15, XZR, x16\n" + "mov z29.d, z13.d\n" + "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "mov z28.d, z13.d\n" + "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n" + "cmp x16, %x[n_channels]\n" + "mov z27.d, z13.d\n" + "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n" + "mov z26.d, z13.d\n" + "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n" + "mov z25.d, z13.d\n" + "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n" + "addvl x6, x6, #16\n" + "mov z24.d, z13.d\n" + "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n" + "mov z23.d, z13.d\n" + "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n" + "addvl x6, x6, #-6\n" + "mov z22.d, z13.d\n" + "ldp x14, x13, [x7, #0x0]\n" + "mov z21.d, z13.d\n" + "ldp x12, x11, [x7, #0x10]\n" + "mov z20.d, z13.d\n" + "ld1w { z9.s }, p2/Z, [x14, x17, LSL #2]\n" + "mov z19.d, z13.d\n" + "mov z18.d, z13.d\n" + "prfm pldl1keep, [x14, x8]\n" + "mov z17.d, z13.d\n" + "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n" + "mov z16.d, z13.d\n" + "prfm pldl1keep, [x13, x8]\n" + "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n" + "prfm pldl1keep, [x12, x8]\n" + "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n" + "prfm pldl1keep, [x11, x8]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.s, p3/M, z8.s, z9.s\n" + "ldr x22, [x7, #0x20]\n" + "whilelt p1.s, x16, %x[n_channels]\n" + "fmla z30.s, p3/M, z7.s, z9.s\n" + "ldr x21, [x7, #0x28]\n" + "incw x15\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "ldr x20, [x7, #0x30]\n" + "mov p0.b, p2.b\n" + "fmla z27.s, p3/M, z5.s, z9.s\n" + "prfm pldl1keep, [x22, x8]\n" + "fmla z26.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x21, x8]\n" + "fmla z25.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x20, x8]\n" + "fmla z23.s, p3/M, z2.s, z9.s\n" + "ldr x19, [x7, #0x38]\n" + "fmla z22.s, p3/M, z1.s, z9.s\n" + "ldr x10, [x7, #0x40]\n" + "fmla z21.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x20, x17, LSL #2]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n" + "fmla z30.s, p3/M, z8.s, z12.s\n" + "prfm pldl1keep, [x19, x8]\n" + "fmla z29.s, p3/M, z7.s, z12.s\n" + "prfm pldl1keep, [x10, x8]\n" + "fmla z26.s, p3/M, z5.s, z12.s\n" + "ldr x9, [x7, #0x48]\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "ldr x28, [x7, #0x50]\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "ldr x27, [x7, #0x58]\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x9, x8]\n" + "fmla z22.s, p3/M, z2.s, z12.s\n" + "prfm pldl1keep, [x28, x8]\n" + "fmla z21.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x27, x8]\n" + "fmla z20.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x19, x17, LSL #2]\n" + "fmla z19.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x9, x17, LSL #2]\n" + "fmla z16.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n" + "fmla z27.s, p3/M, z8.s, z9.s\n" + "ldr x26, [x7, #0x60]\n" + "fmla z26.s, p3/M, z7.s, z9.s\n" + "ldr x25, [x7, #0x68]\n" + "fmla z25.s, p3/M, z6.s, z9.s\n" + "ldr x24, [x7, #0x70]\n" + "fmla z23.s, p3/M, z5.s, z9.s\n" + "prfm pldl1keep, [x26, x8]\n" + "fmla z22.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x25, x8]\n" + "fmla z21.s, p3/M, 
z3.s, z9.s\n" + "prfm pldl1keep, [x24, x8]\n" + "fmla z19.s, p3/M, z2.s, z9.s\n" + "ldr x23, [x7, #0x78]\n" + "fmla z18.s, p3/M, z1.s, z9.s\n" + "ldr x14, [x7, #0x80]\n" + "fmla z17.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x28, x17, LSL #2]\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x23, x8]\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x17, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "prfm pldl1keep, [x14, x8]\n" + "fmla z28.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x26, x17, LSL #2]\n" + "fmla z26.s, p3/M, z8.s, z10.s\n" + "ldr x13, [x7, #0x88]\n" + "fmla z25.s, p3/M, z7.s, z10.s\n" + "ldr x12, [x7, #0x90]\n" + "fmla z24.s, p3/M, z6.s, z10.s\n" + "ldr x11, [x7, #0x98]\n" + "fmla z22.s, p3/M, z5.s, z10.s\n" + "prfm pldl1keep, [x13, x8]\n" + "fmla z21.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x12, x8]\n" + "fmla z20.s, p3/M, z3.s, z10.s\n" + "prfm pldl1keep, [x11, x8]\n" + "fmla z18.s, p3/M, z2.s, z10.s\n" + "ldr x22, [x7, #0xa0]\n" + "fmla z17.s, p3/M, z1.s, z10.s\n" + "ldr x21, [x7, #0xa8]\n" + "fmla z16.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x22, x8]\n" + "fmla z27.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x21, x8]\n" + "fmla z28.s, p3/M, z5.s, z12.s\n" + "ldr x20, [x7, #0xb0]\n" + "fmla z24.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n" + "fmla z23.s, p3/M, z6.s, z11.s\n" + "ldr x19, [x7, #0xb8]\n" + "fmla z19.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n" + "fmla z31.s, p3/M, z5.s, z10.s\n" + "prfm pldl1keep, [x20, x8]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x19, x8]\n" + "fmla z29.s, p3/M, z3.s, z10.s\n" + "ldr x10, [x7, #0xc0]\n" + "fmla z27.s, p3/M, z2.s, z10.s\n" + "ldr x9, [x7, #0xc8]\n" + "fmla z26.s, p3/M, z1.s, z10.s\n" + "ldr x28, [x7, #0xd0]\n" + "fmla z25.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n" + "fmla z20.s, p3/M, z8.s, z11.s\n" + "prfm pldl1keep, [x10, x8]\n" + "fmla z16.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x14, x17, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x9, x8]\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "prfm pldl1keep, [x28, x8]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ldr x27, [x7, #0xd8]\n" + "fmla z26.s, p3/M, z2.s, z12.s\n" + "ldr x26, [x7, #0xe0]\n" + "fmla z25.s, p3/M, z1.s, z12.s\n" + "ldr x25, [x7, #0xe8]\n" + "fmla z24.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n" + "fmla z19.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x27, x8]\n" + "fmla z18.s, p3/M, z6.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z10.s\n" + "prfm pldl1keep, [x26, x8]\n" + "fmla z30.s, p3/M, z6.s, z10.s\n" + "prfm pldl1keep, [x25, x8]\n" + "fmla z27.s, p3/M, z4.s, z10.s\n" + "ldr x24, [x7, #0xf0]\n" + "fmla z26.s, p3/M, z3.s, z10.s\n" + "ldr x23, [x7, #0xf8]\n" + "fmla z23.s, p3/M, z1.s, z10.s\n" + "ldr x14, [x7, #0x100]\n" + "fmla z22.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n" + "fmla z17.s, p3/M, z8.s, z11.s\n" + "prfm pldl1keep, [x24, x8]\n" + "fmla z16.s, p3/M, z7.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "prfm pldl1keep, [x23, x8]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "prfm pldl1keep, [x14, x8]\n" + "fmla z25.s, p3/M, z5.s, z12.s\n" + "ldr x13, [x7, #0x108]\n" + "fmla z24.s, p3/M, z4.s, z12.s\n" + "ldr x12, [x7, #0x110]\n" + "fmla z21.s, p3/M, z2.s, 
z12.s\n" + "ldr x11, [x7, #0x118]\n" + "fmla z20.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x20, x17, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "prfm pldl1keep, [x13, x8]\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "prfm pldl1keep, [x12, x8]\n" + "fmla z29.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x19, x17, LSL #2]\n" + "fmla z27.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x11, x8]\n" + "addvl x8, x8, #1\n" + "fmla z26.s, p3/M, z6.s, z11.s\n" + "ldr x22, [x5, #0x0]\n" + "fmla z23.s, p3/M, z4.s, z11.s\n" + "ldr x21, [x5, #0x8]\n" + "fmla z22.s, p3/M, z3.s, z11.s\n" + "ldr x20, [x5, #0x10]\n" + "fmla z19.s, p3/M, z1.s, z11.s\n" + "ldr x19, [x5, #0x18]\n" + "fmla z18.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "ld1w { z13.s }, p3/Z, [x6]\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x9, x17, LSL #2]\n" + "fmla z31.s, p3/M, z6.s, z10.s\n" + "fmla z27.s, p3/M, z3.s, z10.s\n" + "fmla z23.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x28, x17, LSL #2]\n" + "fmla z25.s, p3/M, z8.s, z11.s\n" + "fmla z24.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z5.s, z11.s\n" + "fmla z20.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z2.s, z11.s\n" + "fmla z16.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x27, x17, LSL #2]\n" + "fmla z28.s, p3/M, z8.s, z12.s\n" + "fmla z24.s, p3/M, z5.s, z12.s\n" + "fmla z20.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x26, x17, LSL #2]\n" + "fmla z27.s, p3/M, z6.s, z10.s\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "fmla z19.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n" + "fmla z22.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z6.s, z11.s\n" + "fmla z23.s, p3/M, z8.s, z11.s\n" + "fmla z19.s, p3/M, z5.s, z11.s\n" + "fmla z18.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n" + "fmla z24.s, p3/M, z8.s, z12.s\n" + "fmla z20.s, p3/M, z5.s, z12.s\n" + "fmla z16.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n" + "fmla z19.s, p3/M, z8.s, z10.s\n" + "fmla z18.s, p3/M, z7.s, z10.s\n" + "fmla z17.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x14, x17, LSL #2]\n" + "fmla z22.s, p3/M, z8.s, z11.s\n" + "fmla z21.s, p3/M, z7.s, z11.s\n" + "fmla z20.s, p3/M, z6.s, z11.s\n" + "fmla z18.s, p3/M, z5.s, z11.s\n" + "fmla z17.s, p3/M, z4.s, z11.s\n" + "fmla z16.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z10.s\n" + "ldp x14, x13, [x7, #0x0]\n" + "fmla z18.s, p3/M, z8.s, z12.s\n" + "ld1w { z9.s }, p1/Z, [x14, x16, LSL #2]\n" + "fmla z17.s, p3/M, z7.s, z12.s\n" + "fmla z16.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x12, x17, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z10.s\n" + "prfm pldl1keep, [x14, x8]\n" + "fmla z27.s, p3/M, z1.s, z10.s\n" + "prfm pldl1keep, [x13, x8]\n" + "fmla z26.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x11, x17, LSL #2]\n" + "incw x17\n" + "fmla z29.s, p3/M, z5.s, z11.s\n" + "ldp x12, x11, [x7, #0x10]\n" + "whilelt p2.s, x17, %x[n_channels]\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "ld1w { z0.s }, p3/Z, [x6, #1, MUL VL]\n" + "fmla z25.s, p3/M, z2.s, z11.s\n" + "ld1w { z2.s }, p3/Z, [x6, #3, MUL VL]\n" + "fmla z24.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x12, x16, LSL #2]\n" + "fmla z23.s, p3/M, z7.s, z12.s\n" + "prfm pldl1keep, [x12, x8]\n" + "fmla z22.s, p3/M, z6.s, z12.s\n" + "prfm pldl1keep, [x11, x8]\n" + "fmla z19.s, p3/M, z4.s, z12.s\n" + "ld1w { z1.s 
}, p3/Z, [x6, #2, MUL VL]\n" + "fmla z18.s, p3/M, z3.s, z12.s\n" + "ld1w { z12.s }, p1/Z, [x11, x16, LSL #2]\n" + "fmla z21.s, p3/M, z8.s, z10.s\n" + "ld1w { z3.s }, p3/Z, [x6, #4, MUL VL]\n" + "fmla z20.s, p3/M, z7.s, z10.s\n" + "ld1w { z6.s }, p3/Z, [x6, #7, MUL VL]\n" + "fmla z17.s, p3/M, z5.s, z10.s\n" + "ld1w { z5.s }, p3/Z, [x6, #6, MUL VL]\n" + "fmla z16.s, p3/M, z4.s, z10.s\n" + "ld1w { z10.s }, p1/Z, [x13, x16, LSL #2]\n" + "incw x16\n" + "fmax z31.s, p3/M, z31.s, z15.s\n" + "ld1w { z4.s }, p3/Z, [x6, #5, MUL VL]\n" + "addvl x6, x6, #16\n" + "fmax z30.s, p3/M, z30.s, z15.s\n" + "ld1w { z7.s }, p3/Z, [x6, #-8, MUL VL]\n" + "cmp x16, %x[n_channels]\n" + "fmax z29.s, p3/M, z29.s, z15.s\n" + "ld1w { z8.s }, p3/Z, [x6, #-7, MUL VL]\n" + "addvl x6, x6, #-6\n" + "fmax z28.s, p3/M, z28.s, z15.s\n" + "fmax z27.s, p3/M, z27.s, z15.s\n" + "fmin z31.s, p3/M, z31.s, z14.s\n" + "st1w { z31.s }, p0, [x22, x15, LSL #2]\n" + "mov z31.d, z13.d\n" + "fmin z30.s, p3/M, z30.s, z14.s\n" + "ldr x22, [x5, #0x20]\n" + "fmin z29.s, p3/M, z29.s, z14.s\n" + "st1w { z30.s }, p0, [x21, x15, LSL #2]\n" + "mov z30.d, z13.d\n" + "fmin z28.s, p3/M, z28.s, z14.s\n" + "st1w { z29.s }, p0, [x20, x15, LSL #2]\n" + "mov z29.d, z13.d\n" + "ldr x21, [x5, #0x28]\n" + "fmin z27.s, p3/M, z27.s, z14.s\n" + "ldr x20, [x5, #0x30]\n" + "fmax z26.s, p3/M, z26.s, z15.s\n" + "st1w { z28.s }, p0, [x19, x15, LSL #2]\n" + "mov z28.d, z13.d\n" + "ldr x19, [x5, #0x38]\n" + "fmax z25.s, p3/M, z25.s, z15.s\n" + "st1w { z27.s }, p0, [x22, x15, LSL #2]\n" + "mov z27.d, z13.d\n" + "ldr x22, [x5, #0x40]\n" + "fmin z26.s, p3/M, z26.s, z14.s\n" + "st1w { z26.s }, p0, [x21, x15, LSL #2]\n" + "mov z26.d, z13.d\n" + "fmin z25.s, p3/M, z25.s, z14.s\n" + "ldr x21, [x5, #0x48]\n" + "fmax z24.s, p3/M, z24.s, z15.s\n" + "st1w { z25.s }, p0, [x20, x15, LSL #2]\n" + "mov z25.d, z13.d\n" + "fmax z23.s, p3/M, z23.s, z15.s\n" + "ldr x20, [x5, #0x50]\n" + "fmin z24.s, p3/M, z24.s, z14.s\n" + "st1w { z24.s }, p0, [x19, x15, LSL #2]\n" + "mov z24.d, z13.d\n" + "fmin z23.s, p3/M, z23.s, z14.s\n" + "ldr x19, [x5, #0x58]\n" + "fmax z22.s, p3/M, z22.s, z15.s\n" + "st1w { z23.s }, p0, [x22, x15, LSL #2]\n" + "mov z23.d, z13.d\n" + "fmax z21.s, p3/M, z21.s, z15.s\n" + "ldr x22, [x5, #0x60]\n" + "fmin z22.s, p3/M, z22.s, z14.s\n" + "st1w { z22.s }, p0, [x21, x15, LSL #2]\n" + "mov z22.d, z13.d\n" + "fmin z21.s, p3/M, z21.s, z14.s\n" + "ldr x21, [x5, #0x68]\n" + "fmax z20.s, p3/M, z20.s, z15.s\n" + "st1w { z21.s }, p0, [x20, x15, LSL #2]\n" + "mov z21.d, z13.d\n" + "fmax z19.s, p3/M, z19.s, z15.s\n" + "ldr x20, [x5, #0x70]\n" + "fmin z20.s, p3/M, z20.s, z14.s\n" + "st1w { z20.s }, p0, [x19, x15, LSL #2]\n" + "mov z20.d, z13.d\n" + "fmin z19.s, p3/M, z19.s, z14.s\n" + "ldr x19, [x5, #0x78]\n" + "fmax z18.s, p3/M, z18.s, z15.s\n" + "st1w { z19.s }, p0, [x22, x15, LSL #2]\n" + "mov z19.d, z13.d\n" + "fmax z17.s, p3/M, z17.s, z15.s\n" + "fmin z18.s, p3/M, z18.s, z14.s\n" + "st1w { z18.s }, p0, [x21, x15, LSL #2]\n" + "mov z18.d, z13.d\n" + "fmin z17.s, p3/M, z17.s, z14.s\n" + "st1w { z17.s }, p0, [x20, x15, LSL #2]\n" + "mov z17.d, z13.d\n" + "fmax z16.s, p3/M, z16.s, z15.s\n" + "fmin z16.s, p3/M, z16.s, z14.s\n" + "st1w { z16.s }, p0, [x19, x15, LSL #2]\n" + "mov z16.d, z13.d\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.s, p3/M, z8.s, z9.s\n" + "ldr x22, [x7, #0x20]\n" + "incw x15\n" + "fmla z30.s, p3/M, z7.s, z9.s\n" + "ldr x21, [x7, #0x28]\n" + "mov p0.b, p2.b\n" + "fmla z29.s, p3/M, z6.s, z9.s\n" + "ldr x20, [x7, #0x30]\n" + "fmla z27.s, p3/M, z5.s, z9.s\n" + 
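+ // Channel tail: a last pass over the final (possibly partial) vector of + // channels. The multiply-accumulate sequence mirrors the main loop, but no + // weights are reloaded and no inputs are staged for a further iteration; + // the st1w stores are masked by p0 (copied from the whilelt-generated p2) + // so only the remaining active lanes are written, with z15/z14 supplying + // the activation min/max for the fused clamp.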
"prfm pldl1keep, [x22, x8]\n" + "fmla z26.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x21, x8]\n" + "fmla z25.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x20, x8]\n" + "fmla z23.s, p3/M, z2.s, z9.s\n" + "ldr x19, [x7, #0x38]\n" + "fmla z22.s, p3/M, z1.s, z9.s\n" + "ldr x10, [x7, #0x40]\n" + "fmla z21.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x20, x17, LSL #2]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n" + "fmla z30.s, p3/M, z8.s, z12.s\n" + "prfm pldl1keep, [x19, x8]\n" + "fmla z29.s, p3/M, z7.s, z12.s\n" + "prfm pldl1keep, [x10, x8]\n" + "fmla z26.s, p3/M, z5.s, z12.s\n" + "ldr x9, [x7, #0x48]\n" + "fmla z28.s, p3/M, z6.s, z12.s\n" + "ldr x28, [x7, #0x50]\n" + "fmla z25.s, p3/M, z4.s, z12.s\n" + "ldr x27, [x7, #0x58]\n" + "fmla z24.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x9, x8]\n" + "fmla z22.s, p3/M, z2.s, z12.s\n" + "prfm pldl1keep, [x28, x8]\n" + "fmla z21.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x27, x8]\n" + "fmla z20.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x19, x17, LSL #2]\n" + "fmla z19.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x9, x17, LSL #2]\n" + "fmla z16.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n" + "fmla z27.s, p3/M, z8.s, z9.s\n" + "ldr x26, [x7, #0x60]\n" + "fmla z26.s, p3/M, z7.s, z9.s\n" + "ldr x25, [x7, #0x68]\n" + "fmla z25.s, p3/M, z6.s, z9.s\n" + "ldr x24, [x7, #0x70]\n" + "fmla z23.s, p3/M, z5.s, z9.s\n" + "prfm pldl1keep, [x26, x8]\n" + "fmla z22.s, p3/M, z4.s, z9.s\n" + "prfm pldl1keep, [x25, x8]\n" + "fmla z21.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x24, x8]\n" + "fmla z19.s, p3/M, z2.s, z9.s\n" + "ldr x23, [x7, #0x78]\n" + "fmla z18.s, p3/M, z1.s, z9.s\n" + "ldr x14, [x7, #0x80]\n" + "fmla z17.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x28, x17, LSL #2]\n" + "fmla z31.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x23, x8]\n" + "fmla z30.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x17, LSL #2]\n" + "fmla z29.s, p3/M, z2.s, z11.s\n" + "prfm pldl1keep, [x14, x8]\n" + "fmla z28.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x26, x17, LSL #2]\n" + "fmla z26.s, p3/M, z8.s, z10.s\n" + "ldr x13, [x7, #0x88]\n" + "fmla z25.s, p3/M, z7.s, z10.s\n" + "ldr x12, [x7, #0x90]\n" + "fmla z24.s, p3/M, z6.s, z10.s\n" + "ldr x11, [x7, #0x98]\n" + "fmla z22.s, p3/M, z5.s, z10.s\n" + "prfm pldl1keep, [x13, x8]\n" + "fmla z21.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x12, x8]\n" + "fmla z20.s, p3/M, z3.s, z10.s\n" + "prfm pldl1keep, [x11, x8]\n" + "fmla z18.s, p3/M, z2.s, z10.s\n" + "ldr x22, [x7, #0xa0]\n" + "fmla z17.s, p3/M, z1.s, z10.s\n" + "ldr x21, [x7, #0xa8]\n" + "fmla z16.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x22, x8]\n" + "fmla z27.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x21, x8]\n" + "fmla z28.s, p3/M, z5.s, z12.s\n" + "ldr x20, [x7, #0xb0]\n" + "fmla z24.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n" + "fmla z23.s, p3/M, z6.s, z11.s\n" + "ldr x19, [x7, #0xb8]\n" + "fmla z19.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n" + "fmla z31.s, p3/M, z5.s, z10.s\n" + "prfm pldl1keep, [x20, x8]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x19, x8]\n" + "fmla z29.s, p3/M, z3.s, z10.s\n" + "ldr x10, [x7, #0xc0]\n" + "fmla z27.s, p3/M, z2.s, z10.s\n" + "ldr x9, [x7, #0xc8]\n" + "fmla z26.s, p3/M, z1.s, z10.s\n" + "ldr x28, [x7, 
#0xd0]\n" + "fmla z25.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x13, x17, LSL #2]\n" + "fmla z20.s, p3/M, z8.s, z11.s\n" + "prfm pldl1keep, [x10, x8]\n" + "fmla z16.s, p3/M, z5.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x14, x17, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x9, x8]\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "prfm pldl1keep, [x28, x8]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ldr x27, [x7, #0xd8]\n" + "fmla z26.s, p3/M, z2.s, z12.s\n" + "ldr x26, [x7, #0xe0]\n" + "fmla z25.s, p3/M, z1.s, z12.s\n" + "ldr x25, [x7, #0xe8]\n" + "fmla z24.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x17, LSL #2]\n" + "fmla z19.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x27, x8]\n" + "fmla z18.s, p3/M, z6.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x12, x17, LSL #2]\n" + "fmla z31.s, p3/M, z7.s, z10.s\n" + "prfm pldl1keep, [x26, x8]\n" + "fmla z30.s, p3/M, z6.s, z10.s\n" + "prfm pldl1keep, [x25, x8]\n" + "fmla z27.s, p3/M, z4.s, z10.s\n" + "ldr x24, [x7, #0xf0]\n" + "fmla z26.s, p3/M, z3.s, z10.s\n" + "ldr x23, [x7, #0xf8]\n" + "fmla z23.s, p3/M, z1.s, z10.s\n" + "ldr x14, [x7, #0x100]\n" + "fmla z22.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x22, x17, LSL #2]\n" + "fmla z17.s, p3/M, z8.s, z11.s\n" + "prfm pldl1keep, [x24, x8]\n" + "fmla z16.s, p3/M, z7.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x21, x17, LSL #2]\n" + "fmla z29.s, p3/M, z8.s, z12.s\n" + "prfm pldl1keep, [x23, x8]\n" + "fmla z28.s, p3/M, z7.s, z12.s\n" + "prfm pldl1keep, [x14, x8]\n" + "fmla z25.s, p3/M, z5.s, z12.s\n" + "ldr x13, [x7, #0x108]\n" + "fmla z24.s, p3/M, z4.s, z12.s\n" + "ldr x12, [x7, #0x110]\n" + "fmla z21.s, p3/M, z2.s, z12.s\n" + "ldr x11, [x7, #0x118]\n" + "fmla z20.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x20, x17, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "prfm pldl1keep, [x13, x8]\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "prfm pldl1keep, [x12, x8]\n" + "fmla z29.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x19, x17, LSL #2]\n" + "fmla z27.s, p3/M, z7.s, z11.s\n" + "prfm pldl1keep, [x11, x8]\n" + "fmla z26.s, p3/M, z6.s, z11.s\n" + "ldr x22, [x5, #0x0]\n" + "fmla z23.s, p3/M, z4.s, z11.s\n" + "ldr x21, [x5, #0x8]\n" + "fmla z22.s, p3/M, z3.s, z11.s\n" + "ldr x20, [x5, #0x10]\n" + "fmla z19.s, p3/M, z1.s, z11.s\n" + "ldr x19, [x5, #0x18]\n" + "fmla z18.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x17, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z12.s\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x9, x17, LSL #2]\n" + "fmla z31.s, p3/M, z6.s, z10.s\n" + "fmla z27.s, p3/M, z3.s, z10.s\n" + "fmla z23.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x28, x17, LSL #2]\n" + "fmla z25.s, p3/M, z8.s, z11.s\n" + "fmla z24.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z5.s, z11.s\n" + "fmla z20.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z2.s, z11.s\n" + "fmla z16.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x27, x17, LSL #2]\n" + "fmla z28.s, p3/M, z8.s, z12.s\n" + "fmla z24.s, p3/M, z5.s, z12.s\n" + "fmla z20.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x26, x17, LSL #2]\n" + "fmla z27.s, p3/M, z6.s, z10.s\n" + "fmla z23.s, p3/M, z3.s, z10.s\n" + "fmla z19.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x25, x17, LSL #2]\n" + "fmla z22.s, p3/M, z7.s, z11.s\n" + "fmla z21.s, p3/M, z6.s, z11.s\n" + "fmla z23.s, p3/M, z8.s, z11.s\n" + "fmla z19.s, p3/M, z5.s, z11.s\n" + "fmla z18.s, p3/M, z4.s, z11.s\n" + "fmla z17.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x24, x17, LSL #2]\n" + "fmla z24.s, 
p3/M, z8.s, z12.s\n" + "fmla z20.s, p3/M, z5.s, z12.s\n" + "fmla z16.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x23, x17, LSL #2]\n" + "fmla z19.s, p3/M, z8.s, z10.s\n" + "fmla z18.s, p3/M, z7.s, z10.s\n" + "fmla z17.s, p3/M, z6.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x14, x17, LSL #2]\n" + "fmla z22.s, p3/M, z8.s, z11.s\n" + "fmla z21.s, p3/M, z7.s, z11.s\n" + "fmla z20.s, p3/M, z6.s, z11.s\n" + "fmla z18.s, p3/M, z5.s, z11.s\n" + "fmla z17.s, p3/M, z4.s, z11.s\n" + "fmla z16.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x13, x17, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z10.s\n" + "fmla z18.s, p3/M, z8.s, z12.s\n" + "fmla z17.s, p3/M, z7.s, z12.s\n" + "fmla z16.s, p3/M, z6.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x12, x17, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z10.s\n" + "fmla z27.s, p3/M, z1.s, z10.s\n" + "fmla z26.s, p3/M, z0.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x11, x17, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z11.s\n" + "fmla z28.s, p3/M, z4.s, z11.s\n" + "fmla z25.s, p3/M, z2.s, z11.s\n" + "fmla z24.s, p3/M, z1.s, z11.s\n" + "fmla z23.s, p3/M, z7.s, z12.s\n" + "fmla z22.s, p3/M, z6.s, z12.s\n" + "fmla z19.s, p3/M, z4.s, z12.s\n" + "fmla z18.s, p3/M, z3.s, z12.s\n" + "fmla z21.s, p3/M, z8.s, z10.s\n" + "fmla z20.s, p3/M, z7.s, z10.s\n" + "fmla z17.s, p3/M, z5.s, z10.s\n" + "fmla z16.s, p3/M, z4.s, z10.s\n" + "fmax z31.s, p3/M, z31.s, z15.s\n" + "fmax z30.s, p3/M, z30.s, z15.s\n" + "fmax z29.s, p3/M, z29.s, z15.s\n" + "fmax z28.s, p3/M, z28.s, z15.s\n" + "fmin z31.s, p3/M, z31.s, z14.s\n" + "st1w { z31.s }, p0, [x22, x15, LSL #2]\n" + "fmin z30.s, p3/M, z30.s, z14.s\n" + "fmin z29.s, p3/M, z29.s, z14.s\n" + "ldr x22, [x5, #0x20]\n" + "fmin z28.s, p3/M, z28.s, z14.s\n" + "st1w { z30.s }, p0, [x21, x15, LSL #2]\n" + "fmax z27.s, p3/M, z27.s, z15.s\n" + "fmax z26.s, p3/M, z26.s, z15.s\n" + "st1w { z29.s }, p0, [x20, x15, LSL #2]\n" + "fmax z25.s, p3/M, z25.s, z15.s\n" + "st1w { z28.s }, p0, [x19, x15, LSL #2]\n" + "fmax z24.s, p3/M, z24.s, z15.s\n" + "ldr x21, [x5, #0x28]\n" + "fmax z23.s, p3/M, z23.s, z15.s\n" + "ldr x20, [x5, #0x30]\n" + "fmin z27.s, p3/M, z27.s, z14.s\n" + "ldr x19, [x5, #0x38]\n" + "fmin z26.s, p3/M, z26.s, z14.s\n" + "st1w { z27.s }, p0, [x22, x15, LSL #2]\n" + "fmin z25.s, p3/M, z25.s, z14.s\n" + "fmin z24.s, p3/M, z24.s, z14.s\n" + "st1w { z26.s }, p0, [x21, x15, LSL #2]\n" + "fmin z23.s, p3/M, z23.s, z14.s\n" + "ldr x22, [x5, #0x40]\n" + "fmax z22.s, p3/M, z22.s, z15.s\n" + "ldr x21, [x5, #0x48]\n" + "fmax z21.s, p3/M, z21.s, z15.s\n" + "st1w { z25.s }, p0, [x20, x15, LSL #2]\n" + "fmax z20.s, p3/M, z20.s, z15.s\n" + "st1w { z24.s }, p0, [x19, x15, LSL #2]\n" + "fmax z19.s, p3/M, z19.s, z15.s\n" + "st1w { z23.s }, p0, [x22, x15, LSL #2]\n" + "fmin z22.s, p3/M, z22.s, z14.s\n" + "ldr x20, [x5, #0x50]\n" + "fmin z21.s, p3/M, z21.s, z14.s\n" + "ldr x19, [x5, #0x58]\n" + "fmin z20.s, p3/M, z20.s, z14.s\n" + "ldr x22, [x5, #0x60]\n" + "fmin z19.s, p3/M, z19.s, z14.s\n" + "st1w { z22.s }, p0, [x21, x15, LSL #2]\n" + "fmax z18.s, p3/M, z18.s, z15.s\n" + "st1w { z21.s }, p0, [x20, x15, LSL #2]\n" + "fmax z17.s, p3/M, z17.s, z15.s\n" + "st1w { z20.s }, p0, [x19, x15, LSL #2]\n" + "fmax z16.s, p3/M, z16.s, z15.s\n" + "st1w { z19.s }, p0, [x22, x15, LSL #2]\n" + "ldr x21, [x5, #0x68]\n" + "fmin z18.s, p3/M, z18.s, z14.s\n" + "ldr x20, [x5, #0x70]\n" + "fmin z17.s, p3/M, z17.s, z14.s\n" + "ldr x19, [x5, #0x78]\n" + "fmin z16.s, p3/M, z16.s, z14.s\n" + "st1w { z18.s }, p0, [x21, x15, LSL #2]\n" + "st1w { z17.s }, p0, [x20, x15, LSL #2]\n" + "st1w { z16.s }, p0, [x19, x15, 
LSL #2]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..f5b6a4f8ff --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); +void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + +struct sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + indirect_kern_type indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; + + sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..ad53872630 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x5, #0x0\n" + "mov x6, #0x0\n" + "1:" // Tile loop + "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x21, #0x4\n" + "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "cntb x7\n" + "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n" + "add x7, x7, XZR, LSL #4\n" + "ldr x20, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cntb x17\n" + "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "cntb x15\n" + "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x5, x20\n" // offset = tile_i * ld_input_row + "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x6, x16, x19\n" // offset += tile_j * ld_input_col + "ldr x13, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x21\n" // offset *= kernel_stride * output_size + "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x14, x14, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x11, x14, x20, LSL #2\n" + "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x10, x11, x20, LSL #2\n" + "ld1w { z17.s }, p3/Z, [x8]\n" + "mov z31.d, z17.d\n" + "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n" + "add x9, x10, x20, LSL #2\n" + "mov z30.d, z17.d\n" + "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n" + "add x28, x9, x20, LSL #2\n" + "mov z29.d, z17.d\n" + "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n" + "add x27, x16, x16\n" + "mov z28.d, z17.d\n" + "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n" + "add x26, x27, x16\n" + "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n" + "add x25, x26, x16\n" + "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n" + "add x17, x17, x16, LSL #4\n" + "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n" + "add x15, x15, x27, LSL #4\n" + "cntb x24\n" + "prfm pldl1keep, [x10, x15]\n" + "prfm pldl1keep, [x14, x7]\n" + "add x24, x24, x26, LSL #4\n" + "prfm pldl1keep, [x14, x17]\n" + 
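+ // Tile addressing: the #0x4 factor above is kernel stride (2) times output + // tile size (2), so each 2x2 output tile steps the input base by four rows + // and four columns. x14/x11/x10/x9/x28 then hold pointers to the five + // consecutive input rows of the 5x5 receptive field, while x16/x27/x26/x25 + // hold the column offsets (1x..4x ld_input_col) within a row.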
"cntb x23\n" + "prfm pldl1keep, [x14, x24]\n" + "add x23, x23, x25, LSL #4\n" + "mov x20, #0x2\n" + "prfm pldl1keep, [x14, x23]\n" + "prfm pldl1keep, [x11, x7]\n" + "mul x19, x5, x22\n" // offset = tile_i * ld_output_row + "prfm pldl1keep, [x11, x17]\n" + "madd x19, x6, x13, x19\n" // offset += tile_j * ld_output_col + "prfm pldl1keep, [x14, x15]\n" + "mul x19, x19, x20\n" // offset *= output_tile_size + "mov x21, #0x0\n" + "add x12, x12, x19, LSL #2\n" // outptrs[0] += offset * sizeof(float) + "add x22, x12, x22, LSL #2\n" + "cntw x20\n" + "sub x19, XZR, x20\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "ld1w { z9.s }, p2/Z, [x10, x27, LSL #2]\n" + "ld1w { z10.s }, p2/Z, [x14]\n" + "addvl x8, x8, #16\n" + "ld1w { z11.s }, p2/Z, [x14, x16, LSL #2]\n" + "cmp x20, %x[n_channels]\n" + "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n" + "addvl x8, x8, #-6\n" + "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n" + "ld1w { z13.s }, p2/Z, [x14, x25, LSL #2]\n" + "ld1w { z14.s }, p2/Z, [x11]\n" + "ld1w { z15.s }, p2/Z, [x11, x16, LSL #2]\n" + "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.s, p3/M, z8.s, z9.s\n" + "prfm pldl1keep, [x11, x24]\n" + "whilelt p1.s, x20, %x[n_channels]\n" + "fmla z30.s, p3/M, z6.s, z9.s\n" + "prfm pldl1keep, [x11, x23]\n" + "incw x19\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "prfm pldl1keep, [x11, x15]\n" + "mov p0.b, p2.b\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x9, x7]\n" + "incw x21\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "prfm pldl1keep, [x10, x7]\n" + "addvl x14, x14, #1\n" + "fmla z30.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x25, LSL #2]\n" + "incw x20\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n" + "prfm pldl1keep, [x9, x17]\n" + "fmla z30.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x11, x27, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x9]\n" + "addvl x11, x11, #1\n" + "fmla z30.s, p3/M, z0.s, z16.s\n" + "prfm pldl1keep, [x10, x17]\n" + "prfm pldl1keep, [x9, x24]\n" + "fmla z29.s, p3/M, z3.s, z14.s\n" + "prfm pldl1keep, [x10, x24]\n" + "ld1w { z14.s }, p2/Z, [x9, x25, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z15.s\n" + "ld1w { z15.s }, p2/Z, [x10]\n" + "fmla z30.s, p3/M, z4.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x9, x16, LSL #2]\n" + "fmla z29.s, p3/M, z0.s, z15.s\n" + "prfm pldl1keep, [x9, x23]\n" + "prfm pldl1keep, [x28, x7]\n" + "fmla z31.s, p3/M, z2.s, z16.s\n" + "ld1w { z16.s }, p2/Z, [x10, x16, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n" + "prfm pldl1keep, [x10, x23]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "addvl x10, x10, #1\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n" + "prfm pldl1keep, [x28, x17]\n" + "fmla z29.s, p3/M, z1.s, z16.s\n" + "prfm pldl1keep, [x9, x15]\n" + "prfm pldl1keep, [x28, x24]\n" + "fmla z31.s, p3/M, z6.s, z15.s\n" + "fmla z28.s, p3/M, z4.s, z13.s\n" + "ld1w { z15.s }, p2/Z, [x28]\n" + "fmla z30.s, p3/M, z7.s, z12.s\n" + "ld1w { z13.s }, p2/Z, [x28, x16, LSL #2]\n" + "prfm pldl1keep, [x28, x15]\n" + "fmla z31.s, p3/M, z7.s, z16.s\n" + "ld1w { z16.s }, p2/Z, [x9, x27, LSL #2]\n" + "addvl x9, x9, #1\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x28, x23]\n" + "fmla z29.s, p3/M, z6.s, z15.s\n" + "ld1w { z15.s }, p2/Z, [x28, x27, LSL #2]\n" + "fmla z30.s, p3/M, z8.s, 
z11.s\n" + "ld1w { z9.s }, p1/Z, [x10, x27, LSL #2]\n" + "prfm pldl1keep, [x10, x15]\n" + "fmax z31.s, p3/M, z31.s, z19.s\n" + "ld1w { z10.s }, p1/Z, [x14]\n" + "fmla z28.s, p3/M, z5.s, z14.s\n" + "fmla z29.s, p3/M, z7.s, z13.s\n" + "ld1w { z14.s }, p2/Z, [x28, x26, LSL #2]\n" + "fmax z30.s, p3/M, z30.s, z19.s\n" + "prfm pldl1keep, [x14, x7]\n" + "prfm pldl1keep, [x14, x17]\n" + "fmin z31.s, p3/M, z31.s, z18.s\n" + "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "fmla z29.s, p3/M, z5.s, z16.s\n" + "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n" + "whilelt p2.s, x21, %x[n_channels]\n" + "fmin z30.s, p3/M, z30.s, z18.s\n" + "prfm pldl1keep, [x14, x24]\n" + "addvl x28, x28, #1\n" + "fmla z28.s, p3/M, z3.s, z16.s\n" + "ld1w { z13.s }, p1/Z, [x14, x25, LSL #2]\n" + "cmp x20, %x[n_channels]\n" + "fmla z29.s, p3/M, z8.s, z15.s\n" + "prfm pldl1keep, [x14, x23]\n" + "prfm pldl1keep, [x11, x7]\n" + "fmla z28.s, p3/M, z7.s, z14.s\n" + "ld1w { z14.s }, p1/Z, [x11]\n" + "prfm pldl1keep, [x11, x17]\n" + "fmax z29.s, p3/M, z29.s, z19.s\n" + "ld1w { z16.s }, p1/Z, [x14, x27, LSL #2]\n" + "fmla z28.s, p3/M, z6.s, z15.s\n" + "ld1w { z15.s }, p1/Z, [x11, x16, LSL #2]\n" + "prfm pldl1keep, [x14, x15]\n" + "fmin z29.s, p3/M, z29.s, z18.s\n" + "st1w { z31.s }, p0, [x12]\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x14, x16, LSL #2]\n" + "st1w { z30.s }, p0, [x12, x13, LSL #2]\n" + "fmax z28.s, p3/M, z28.s, z19.s\n" + "st1w { z29.s }, p0, [x22]\n" + "addvl x12, x12, #1\n" + "fmin z28.s, p3/M, z28.s, z18.s\n" + "ld1w { z17.s }, p3/Z, [x8]\n" + "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n" + "mov z31.d, z17.d\n" + "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n" + "mov z30.d, z17.d\n" + "st1w { z28.s }, p0, [x22, x13, LSL #2]\n" + "addvl x22, x22, #1\n" + "mov z29.d, z17.d\n" + "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n" + "mov z28.d, z17.d\n" + "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n" + "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n" + "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n" + "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n" + "addvl x8, x8, #16\n" + "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n" + "addvl x8, x8, #-6\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.s, p3/M, z8.s, z9.s\n" + "prfm pldl1keep, [x11, x24]\n" + "mov p0.b, p2.b\n" + "fmla z30.s, p3/M, z6.s, z9.s\n" + "prfm pldl1keep, [x11, x23]\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "prfm pldl1keep, [x11, x15]\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x9, x7]\n" + "prfm pldl1keep, [x10, x7]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "prfm pldl1keep, [x9, x17]\n" + "fmla z30.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x11, x25, LSL #2]\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x11, x26, LSL #2]\n" + "prfm pldl1keep, [x10, x17]\n" + "fmla z30.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x11, x27, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x9]\n" + "prfm pldl1keep, [x9, x24]\n" + "fmla z30.s, p3/M, z0.s, z16.s\n" + "prfm pldl1keep, [x10, x24]\n" + "fmla z29.s, p3/M, z3.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x9, x25, LSL #2]\n" + "prfm pldl1keep, [x9, x23]\n" + "fmla z31.s, p3/M, z4.s, z15.s\n" + "ld1w { z15.s }, p2/Z, [x10]\n" + "fmla z30.s, p3/M, z4.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x9, x16, LSL #2]\n" + "fmla z29.s, p3/M, z0.s, z15.s\n" + "prfm pldl1keep, [x28, x7]\n" + "prfm pldl1keep, [x10, x23]\n" + "fmla z31.s, p3/M, z2.s, z16.s\n" + "ld1w { z16.s }, p2/Z, [x10, x16, LSL #2]\n" + "fmla 
z30.s, p3/M, z5.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x25, LSL #2]\n" + "prfm pldl1keep, [x28, x17]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "prfm pldl1keep, [x9, x15]\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x9, x26, LSL #2]\n" + "fmla z29.s, p3/M, z1.s, z16.s\n" + "prfm pldl1keep, [x28, x24]\n" + "prfm pldl1keep, [x28, x15]\n" + "fmla z31.s, p3/M, z6.s, z15.s\n" + "fmla z28.s, p3/M, z4.s, z13.s\n" + "ld1w { z15.s }, p2/Z, [x28]\n" + "fmla z30.s, p3/M, z7.s, z12.s\n" + "ld1w { z13.s }, p2/Z, [x28, x16, LSL #2]\n" + "prfm pldl1keep, [x28, x23]\n" + "fmla z31.s, p3/M, z7.s, z16.s\n" + "ld1w { z16.s }, p2/Z, [x9, x27, LSL #2]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x5, #0x1\n" + "fmla z28.s, p3/M, z5.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x28, x26, LSL #2]\n" + "fmla z29.s, p3/M, z6.s, z15.s\n" + "ld1w { z15.s }, p2/Z, [x28, x27, LSL #2]\n" + "fmla z30.s, p3/M, z8.s, z11.s\n" + "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "add x6, x6, #0x1\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x28, x25, LSL #2]\n" + "fmla z29.s, p3/M, z7.s, z13.s\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmax z31.s, p3/M, z31.s, z19.s\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x6, x19\n" + "fmla z29.s, p3/M, z5.s, z16.s\n" + "fmla z28.s, p3/M, z3.s, z16.s\n" + "csel x6, x6, XZR, LT\n" + "fmin z31.s, p3/M, z31.s, z18.s\n" + "st1w { z31.s }, p0, [x12]\n" + "fmla z28.s, p3/M, z7.s, z14.s\n" + "csel x5, x5, x21, LT\n" + "fmla z29.s, p3/M, z8.s, z15.s\n" + "cmp x5, x20\n" + "fmax z30.s, p3/M, z30.s, z19.s\n" + "fmla z28.s, p3/M, z6.s, z15.s\n" + "fmin z30.s, p3/M, z30.s, z18.s\n" + "st1w { z30.s }, p0, [x12, x13, LSL #2]\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "fmax z29.s, p3/M, z29.s, z19.s\n" + "fmin z29.s, p3/M, z29.s, z18.s\n" + "st1w { z29.s }, p0, [x22]\n" + "fmax z28.s, p3/M, z28.s, z19.s\n" + "fmin z28.s, p3/M, z28.s, z18.s\n" + "st1w { z28.s }, p0, [x22, x13, LSL #2]\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..06b3575d4b --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[25]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[12]; + inptrs[1] = input_ptrs[0]; + inptrs[2] = input_ptrs[1]; + inptrs[3] = input_ptrs[3]; + inptrs[4] = input_ptrs[4]; + inptrs[5] = input_ptrs[5]; + inptrs[6] = input_ptrs[6]; + inptrs[7] = input_ptrs[2]; + inptrs[8] = input_ptrs[8]; + inptrs[9] = input_ptrs[9]; + inptrs[10] = input_ptrs[7]; + inptrs[11] = input_ptrs[15]; + inptrs[12] = input_ptrs[10]; + inptrs[13] = input_ptrs[16]; + inptrs[14] = input_ptrs[11]; + inptrs[15] = input_ptrs[18]; + inptrs[16] = input_ptrs[13]; + inptrs[17] = input_ptrs[19]; + inptrs[18] = input_ptrs[20]; + inptrs[19] = input_ptrs[14]; + inptrs[20] = input_ptrs[21]; + inptrs[21] = input_ptrs[17]; + inptrs[22] = input_ptrs[23]; + inptrs[23] = input_ptrs[22]; + inptrs[24] = input_ptrs[24]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n" + "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "cntb x12, ALL, MUL #2\n" + "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "mov x11, 
#0x0\n" + "ldp x10, x9, [x19, #0x0]\n" + "cntw x28\n" + "ldp x27, x26, [x19, #0x10]\n" + "sub x25, XZR, x28\n" + "ld1w { z17.s }, p3/Z, [x14]\n" + "mov z31.d, z17.d\n" + "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "mov z30.d, z17.d\n" + "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n" + "cmp x28, %x[n_channels]\n" + "mov z29.d, z17.d\n" + "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n" + "mov z28.d, z17.d\n" + "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n" + "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n" + "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n" + "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "ldp x21, x20, [x13, #0x0]\n" + "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n" + "addvl x14, x14, #-6\n" + "ld1w { z9.s }, p2/Z, [x21, x11, LSL #2]\n" + "prfm pldl1keep, [x21, x12]\n" + "ld1w { z10.s }, p2/Z, [x20, x11, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "ldp x24, x23, [x13, #0x10]\n" + "ldp x22, x21, [x13, #0x20]\n" + "ldp x20, x19, [x13, #0x30]\n" + "ld1w { z11.s }, p2/Z, [x24, x11, LSL #2]\n" + "prfm pldl1keep, [x24, x12]\n" + "ld1w { z12.s }, p2/Z, [x23, x11, LSL #2]\n" + "prfm pldl1keep, [x23, x12]\n" + "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n" + "prfm pldl1keep, [x22, x12]\n" + "ld1w { z14.s }, p2/Z, [x21, x11, LSL #2]\n" + "prfm pldl1keep, [x21, x12]\n" + "ld1w { z15.s }, p2/Z, [x20, x11, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n" + "prfm pldl1keep, [x19, x12]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.s, p3/M, z8.s, z9.s\n" + "ldr x22, [x13, #0x40]\n" + "whilelt p1.s, x28, %x[n_channels]\n" + "fmla z30.s, p3/M, z6.s, z9.s\n" + "ldr x21, [x13, #0x48]\n" + "incw x25\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "ldr x20, [x13, #0x50]\n" + "mov p0.b, p2.b\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x22, x12]\n" + "ldr x19, [x13, #0x58]\n" + "fmla z31.s, p3/M, z0.s, z10.s\n" + "prfm pldl1keep, [x21, x12]\n" + "fmla z30.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "fmla z30.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x20, x11, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x19, x11, LSL #2]\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z30.s, p3/M, z0.s, z16.s\n" + "ldr x21, [x13, #0x60]\n" + "fmla z29.s, p3/M, z3.s, z14.s\n" + "ldr x20, [x13, #0x68]\n" + "ldr x19, [x13, #0x70]\n" + "fmla z31.s, p3/M, z4.s, z15.s\n" + "ld1w { z15.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z11.s\n" + "prfm pldl1keep, [x21, x12]\n" + "fmla z29.s, p3/M, z0.s, z15.s\n" + "ld1w { z11.s }, p2/Z, [x20, x11, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "fmla z31.s, p3/M, z2.s, z16.s\n" + "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "ldr x19, [x13, #0x78]\n" + "ldr x21, [x13, #0x80]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "ldr x20, [x13, #0x88]\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "ldr x24, [x13, #0x90]\n" + "fmla z29.s, p3/M, z1.s, z16.s\n" + "ld1w { z13.s }, p2/Z, [x19, x11, LSL #2]\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z31.s, p3/M, z6.s, z15.s\n" + "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z28.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x21, x12]\n" + "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n" + "fmla z30.s, p3/M, z7.s, z12.s\n" + "prfm 
pldl1keep, [x20, x12]\n" + "fmla z31.s, p3/M, z7.s, z16.s\n" + "ld1w { z15.s }, p2/Z, [x24, x11, LSL #2]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x24, x12]\n" + "ldr x23, [x13, #0x98]\n" + "fmla z29.s, p3/M, z6.s, z15.s\n" + "ldr x22, [x13, #0xa0]\n" + "fmax z31.s, p3/M, z31.s, z19.s\n" + "ldr x21, [x13, #0xa8]\n" + "fmla z28.s, p3/M, z5.s, z14.s\n" + "ld1w { z11.s }, p2/Z, [x23, x11, LSL #2]\n" + "prfm pldl1keep, [x23, x12]\n" + "fmin z31.s, p3/M, z31.s, z18.s\n" + "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n" + "prfm pldl1keep, [x22, x12]\n" + "fmla z30.s, p3/M, z8.s, z11.s\n" + "ld1w { z16.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "prfm pldl1keep, [x21, x12]\n" + "fmla z29.s, p3/M, z7.s, z13.s\n" + "ldr x20, [x13, #0xb0]\n" + "fmax z30.s, p3/M, z30.s, z19.s\n" + "ldr x19, [x13, #0xb8]\n" + "ldr x22, [x13, #0xc0]\n" + "fmla z28.s, p3/M, z3.s, z16.s\n" + "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z16.s\n" + "prfm pldl1keep, [x20, x12]\n" + "fmin z30.s, p3/M, z30.s, z18.s\n" + "ld1w { z15.s }, p2/Z, [x19, x11, LSL #2]\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z28.s, p3/M, z7.s, z14.s\n" + "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n" + "fmla z29.s, p3/M, z8.s, z15.s\n" + "prfm pldl1keep, [x22, x12]\n" + "incw x11\n" + "fmla z28.s, p3/M, z6.s, z15.s\n" + "ldp x21, x20, [x13, #0x0]\n" + "whilelt p2.s, x11, %x[n_channels]\n" + "fmax z29.s, p3/M, z29.s, z19.s\n" + "ldp x24, x23, [x13, #0x10]\n" + "addvl x12, x12, #1\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "ld1w { z9.s }, p1/Z, [x21, x28, LSL #2]\n" + "prfm pldl1keep, [x21, x12]\n" + "fmin z29.s, p3/M, z29.s, z18.s\n" + "ld1w { z10.s }, p1/Z, [x20, x28, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "fmax z28.s, p3/M, z28.s, z19.s\n" + "ld1w { z11.s }, p1/Z, [x24, x28, LSL #2]\n" + "prfm pldl1keep, [x24, x12]\n" + "fmin z28.s, p3/M, z28.s, z18.s\n" + "ld1w { z12.s }, p1/Z, [x23, x28, LSL #2]\n" + "prfm pldl1keep, [x23, x12]\n" + "ldp x22, x21, [x13, #0x20]\n" + "ldp x20, x19, [x13, #0x30]\n" + "st1w { z31.s }, p0, [x10, x25, LSL #2]\n" + "ld1w { z13.s }, p1/Z, [x22, x28, LSL #2]\n" + "prfm pldl1keep, [x22, x12]\n" + "ld1w { z14.s }, p1/Z, [x21, x28, LSL #2]\n" + "prfm pldl1keep, [x21, x12]\n" + "ld1w { z15.s }, p1/Z, [x20, x28, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "ld1w { z16.s }, p1/Z, [x19, x28, LSL #2]\n" + "incw x28\n" + "prfm pldl1keep, [x19, x12]\n" + "cmp x28, %x[n_channels]\n" + "st1w { z30.s }, p0, [x9, x25, LSL #2]\n" + "st1w { z29.s }, p0, [x27, x25, LSL #2]\n" + "st1w { z28.s }, p0, [x26, x25, LSL #2]\n" + "ld1w { z17.s }, p3/Z, [x14]\n" + "mov z31.d, z17.d\n" + "ld1w { z0.s }, p3/Z, [x14, #1, MUL VL]\n" + "mov z30.d, z17.d\n" + "ld1w { z1.s }, p3/Z, [x14, #2, MUL VL]\n" + "mov z29.d, z17.d\n" + "ld1w { z2.s }, p3/Z, [x14, #3, MUL VL]\n" + "mov z28.d, z17.d\n" + "ld1w { z3.s }, p3/Z, [x14, #4, MUL VL]\n" + "ld1w { z4.s }, p3/Z, [x14, #5, MUL VL]\n" + "ld1w { z5.s }, p3/Z, [x14, #6, MUL VL]\n" + "ld1w { z6.s }, p3/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "ld1w { z7.s }, p3/Z, [x14, #-8, MUL VL]\n" + "ld1w { z8.s }, p3/Z, [x14, #-7, MUL VL]\n" + "addvl x14, x14, #-6\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.s, p3/M, z8.s, z9.s\n" + "ldr x22, [x13, #0x40]\n" + "incw x25\n" + "fmla z30.s, p3/M, z6.s, z9.s\n" + "ldr x21, [x13, #0x48]\n" + "mov p0.b, p2.b\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "ldr x20, [x13, #0x50]\n" + "fmla z28.s, p3/M, z0.s, z9.s\n" + "prfm pldl1keep, [x22, x12]\n" + "ldr x19, [x13, #0x58]\n" + "fmla z31.s, p3/M, 
z0.s, z10.s\n" + "prfm pldl1keep, [x21, x12]\n" + "fmla z30.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "fmla z30.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x20, x11, LSL #2]\n" + "fmla z31.s, p3/M, z3.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x19, x11, LSL #2]\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z30.s, p3/M, z0.s, z16.s\n" + "ldr x21, [x13, #0x60]\n" + "fmla z29.s, p3/M, z3.s, z14.s\n" + "ldr x20, [x13, #0x68]\n" + "ldr x19, [x13, #0x70]\n" + "fmla z31.s, p3/M, z4.s, z15.s\n" + "ld1w { z15.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z11.s\n" + "prfm pldl1keep, [x21, x12]\n" + "fmla z29.s, p3/M, z0.s, z15.s\n" + "ld1w { z11.s }, p2/Z, [x20, x11, LSL #2]\n" + "prfm pldl1keep, [x20, x12]\n" + "fmla z31.s, p3/M, z2.s, z16.s\n" + "ld1w { z16.s }, p2/Z, [x19, x11, LSL #2]\n" + "fmla z30.s, p3/M, z5.s, z12.s\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z29.s, p3/M, z4.s, z11.s\n" + "ldr x19, [x13, #0x78]\n" + "ldr x21, [x13, #0x80]\n" + "fmla z31.s, p3/M, z5.s, z13.s\n" + "ldr x20, [x13, #0x88]\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "ldr x24, [x13, #0x90]\n" + "fmla z29.s, p3/M, z1.s, z16.s\n" + "ld1w { z13.s }, p2/Z, [x19, x11, LSL #2]\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z31.s, p3/M, z6.s, z15.s\n" + "ld1w { z12.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z28.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x21, x12]\n" + "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n" + "fmla z30.s, p3/M, z7.s, z12.s\n" + "prfm pldl1keep, [x20, x12]\n" + "fmla z31.s, p3/M, z7.s, z16.s\n" + "ld1w { z15.s }, p2/Z, [x24, x11, LSL #2]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x24, x12]\n" + "ldr x23, [x13, #0x98]\n" + "fmla z29.s, p3/M, z6.s, z15.s\n" + "ldr x22, [x13, #0xa0]\n" + "fmax z31.s, p3/M, z31.s, z19.s\n" + "ldr x21, [x13, #0xa8]\n" + "fmla z28.s, p3/M, z5.s, z14.s\n" + "ld1w { z11.s }, p2/Z, [x23, x11, LSL #2]\n" + "prfm pldl1keep, [x23, x12]\n" + "fmin z31.s, p3/M, z31.s, z18.s\n" + "ld1w { z13.s }, p2/Z, [x22, x11, LSL #2]\n" + "prfm pldl1keep, [x22, x12]\n" + "fmla z30.s, p3/M, z8.s, z11.s\n" + "ld1w { z16.s }, p2/Z, [x21, x11, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "prfm pldl1keep, [x21, x12]\n" + "fmla z29.s, p3/M, z7.s, z13.s\n" + "ldr x20, [x13, #0xb0]\n" + "fmax z30.s, p3/M, z30.s, z19.s\n" + "ldr x19, [x13, #0xb8]\n" + "ldr x22, [x13, #0xc0]\n" + "fmla z28.s, p3/M, z3.s, z16.s\n" + "ld1w { z14.s }, p2/Z, [x20, x11, LSL #2]\n" + "fmla z29.s, p3/M, z5.s, z16.s\n" + "prfm pldl1keep, [x20, x12]\n" + "fmin z30.s, p3/M, z30.s, z18.s\n" + "ld1w { z15.s }, p2/Z, [x19, x11, LSL #2]\n" + "prfm pldl1keep, [x19, x12]\n" + "fmla z28.s, p3/M, z7.s, z14.s\n" + "ld1w { z11.s }, p2/Z, [x22, x11, LSL #2]\n" + "fmla z29.s, p3/M, z8.s, z15.s\n" + "prfm pldl1keep, [x22, x12]\n" + "st1w { z31.s }, p0, [x10, x25, LSL #2]\n" + "fmla z28.s, p3/M, z6.s, z15.s\n" + "st1w { z30.s }, p0, [x9, x25, LSL #2]\n" + "fmax z29.s, p3/M, z29.s, z19.s\n" + "fmla z28.s, p3/M, z8.s, z11.s\n" + "fmin z29.s, p3/M, z29.s, z18.s\n" + "st1w { z29.s }, p0, [x27, x25, LSL #2]\n" + "fmax z28.s, p3/M, z28.s, z19.s\n" + "fmin z28.s, p3/M, z28.s, z18.s\n" + "st1w { z28.s }, p0, [x26, x25, LSL #2]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), 
[offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..d49f7fdceb --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
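Every kernel in this family drives its channel loop with SVE predication rather than a scalar tail: "whilelt" builds the lane mask, and "incw"/"cmp" step one whole vector of channels per iteration. The following intrinsics fragment is only a rough sketch of that skeleton (the function name and the trivial per-channel operation are invented for illustration; the shipped kernels are the hand-scheduled assembly in this patch):

    // Build with SVE enabled, e.g. -march=armv8-a+sve.
    #include <arm_sve.h>

    // Sketch of the predicated channel loop the assembly implements with
    // "whilelt p.s, x, n_channels": past n_channels the predicate goes
    // all-false, so the final partial vector needs no scalar cleanup.
    void channel_loop_sketch(const float *in, float *out, unsigned int n_channels, float scale)
    {
      for (unsigned int x = 0; x < n_channels; x += (unsigned int) svcntw())
      {
        const svbool_t pg = svwhilelt_b32(x, n_channels);     // active lanes only
        const svfloat32_t v = svld1_f32(pg, in + x);          // predicated load
        svst1_f32(pg, out + x, svmul_n_f32_x(pg, v, scale));  // predicated store
      }
    }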
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); +void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + +struct sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); + typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + indirect_kern_type indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; + direct_kern_type direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; + + sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..f751186dce --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
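The constants in each of these descriptor headers obey the depthfirst tiling relation input = (output - 1) * stride + kernel, which is why a 2x2 output tile of this 5x5 stride-1 kernel reads a 6x6 input patch. A compile-time restatement of that invariant (an illustrative sketch, not code from this patch):

    // Not part of the patch: checks the relation the descriptor constants
    // above satisfy, here 6 = (2 - 1) * 1 + 5 in both dimensions.
    template <typename Desc>
    constexpr bool tile_geometry_consistent()
    {
      return Desc::input_rows == (Desc::output_rows - 1) * Desc::stride_rows + Desc::kernel_rows
          && Desc::input_cols == (Desc::output_cols - 1) * Desc::stride_cols + Desc::kernel_cols;
    }
    static_assert(tile_geometry_consistent<arm_conv::depthwise::sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>(),
                  "a 2x2 output tile of a 5x5 stride-1 kernel needs a 6x6 input tile");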
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp new file mode 100644 index 0000000000..f751186dce --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + const uint64_t n_tile_rows, n_tile_cols; + const float *inptr; + const uint64_t ld_input_row; + const uint64_t ld_input_col; + float *outptr; + const uint64_t ld_output_row; + const uint64_t ld_output_col; + const void *params; + const float min, max; + + uint64_t tile_i = 0, tile_j = 0; + + Args( + const unsigned int n_tile_rows, + const unsigned int n_tile_cols, + const float *inptr, + int64_t ld_input_row, + int64_t ld_input_col, + float *outptr, + int64_t ld_output_row, + int64_t ld_output_col, + const void *params, + const float activation_min, + const float activation_max + ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr), + ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr), + ld_output_row(ld_output_row), ld_output_col(ld_output_col), + params(params), min(activation_min), max(activation_max) + { + } + }; + + Args params_struct( + n_tile_rows, n_tile_cols, + inptr, ld_input_row, ld_input_col, + outptr, ld_output_row, ld_output_col, + params, activation_min, activation_max + ); + + __asm__ __volatile__( + "ptrue p3.b\n" + "mov x5, #0x0\n" + "mov x6, #0x0\n" + "1:" // Tile loop + "str x5, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "mov x20, #0x2\n" + "str x6, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "mov x7, #0x2\n" + "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n" + "mov x17, #0x0\n" + "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n" + "cntw x16\n" + "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n" + "sub x14, XZR, x16\n" + "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n" + "mul x19, x5, x22\n" // offset = tile_i * ld_input_row + "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n" + "madd x19, x6, x15, x19\n" // offset += tile_j * ld_input_col + "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n" + "mul x19, x19, x20\n" // offset *= kernel_stride * output_size + "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n" + "add x13, x13, x19, LSL #2\n" // inptr[0] += offset * sizeof(float) + "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "add x20, x13, x22, LSL #2\n" + "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "add x10, x20, x22, LSL #2\n" + "ld1w { z16.s }, p3/Z, [x8]\n" + "mov z31.d, z16.d\n" + "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n" + "add x9, x10, x22, LSL #2\n" + "mov z30.d, z16.d\n" + "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n" + "add x28, x9, x22, LSL #2\n" + "mov z29.d, z16.d\n" + "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n" + "add x27, x28, x22, LSL #2\n" + "mov z28.d, z16.d\n" + "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n" + "add x26, x15, x15\n" + "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n" + "add x25, x26, x15\n" + "mul x19, x5, x21\n" // offset = tile_i * ld_output_row + "add x24, x25, x15\n" + "add x23, x24, x15\n" + "madd x19, x6, x12, x19\n" // offset += tile_j * ld_output_col + "mul x19, x19, x7\n" // offset *= output_tile_size + "add x11, x11, x19, LSL #2\n" // outptrs[0] += offset *
sizeof(float) + "add x22, x11, x21, LSL #2\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "ld1w { z5.s }, p2/Z, [x13]\n" + "ld1w { z6.s }, p2/Z, [x13, x15, LSL #2]\n" + "cmp x16, %x[n_channels]\n" + "ld1w { z7.s }, p2/Z, [x20]\n" + "addvl x8, x8, #6\n" + "ld1w { z8.s }, p2/Z, [x20, x15, LSL #2]\n" + "ld1w { z9.s }, p2/Z, [x13, x26, LSL #2]\n" + "ld1w { z13.s }, p2/Z, [x20, x26, LSL #2]\n" + "ld1w { z11.s }, p2/Z, [x13, x25, LSL #2]\n" + "ld1w { z12.s }, p2/Z, [x13, x24, LSL #2]\n" + "ld1w { z10.s }, p2/Z, [x20, x23, LSL #2]\n" + "ld1w { z14.s }, p2/Z, [x10]\n" + "bge 3f\n" + "2:" // Tile loop: Channel loop + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n" + "whilelt p1.s, x16, %x[n_channels]\n" + "fmla z30.s, p3/M, z0.s, z6.s\n" + "incw x14\n" + "fmla z29.s, p3/M, z0.s, z7.s\n" + "mov p0.b, p2.b\n" + "fmla z28.s, p3/M, z0.s, z8.s\n" + "ld1w { z0.s }, p3/Z, [x8]\n" + "incw x17\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n" + "addvl x20, x20, #1\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "incw x16\n" + "fmla z29.s, p3/M, z1.s, z8.s\n" + "fmla z28.s, p3/M, z1.s, z13.s\n" + "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n" + "addvl x13, x13, #1\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "fmla z29.s, p3/M, z2.s, z13.s\n" + "fmla z28.s, p3/M, z2.s, z5.s\n" + "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "fmla z29.s, p3/M, z3.s, z5.s\n" + "fmla z28.s, p3/M, z3.s, z6.s\n" + "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z6.s\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z7.s\n" + "ld1w { z7.s }, p1/Z, [x20]\n" + "fmla z30.s, p3/M, z0.s, z8.s\n" + "fmla z29.s, p3/M, z0.s, z14.s\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z8.s\n" + "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z13.s\n" + "fmla z29.s, p3/M, z1.s, z11.s\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n" + "fmla z31.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n" + "addvl x10, x10, #1\n" + "fmla z30.s, p3/M, z2.s, z5.s\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "fmla z28.s, p3/M, z2.s, z9.s\n" + "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n" + "addvl x8, x8, #16\n" + "fmla z31.s, p3/M, z3.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x9]\n" + "ld1w { z16.s }, p3/Z, [x8, #4, MUL VL]\n" + "fmla z30.s, p3/M, z3.s, z6.s\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "fmla z28.s, p3/M, z4.s, z8.s\n" + "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z11.s\n" + "fmla z29.s, p3/M, z0.s, z5.s\n" + "fmla z28.s, p3/M, z0.s, z6.s\n" + "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n" + 
"fmla z30.s, p3/M, z1.s, z12.s\n" + "fmla z29.s, p3/M, z1.s, z6.s\n" + "fmla z28.s, p3/M, z1.s, z10.s\n" + "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n" + "fmla z31.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n" + "addvl x9, x9, #1\n" + "fmla z30.s, p3/M, z2.s, z9.s\n" + "fmla z29.s, p3/M, z2.s, z10.s\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x28]\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z8.s\n" + "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "fmla z28.s, p3/M, z4.s, z14.s\n" + "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z6.s\n" + "fmla z29.s, p3/M, z0.s, z9.s\n" + "fmla z28.s, p3/M, z0.s, z13.s\n" + "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "fmla z29.s, p3/M, z1.s, z13.s\n" + "fmla z28.s, p3/M, z1.s, z5.s\n" + "ld1w { z1.s }, p3/Z, [x8]\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n" + "addvl x28, x28, #1\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "fmla z29.s, p3/M, z2.s, z5.s\n" + "fmla z28.s, p3/M, z2.s, z6.s\n" + "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x27]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "fmla z29.s, p3/M, z3.s, z6.s\n" + "fmla z28.s, p3/M, z3.s, z8.s\n" + "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z14.s\n" + "ld1w { z14.s }, p1/Z, [x10]\n" + "fmla z29.s, p3/M, z4.s, z8.s\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "ld1w { z13.s }, p1/Z, [x20, x26, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z5.s\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n" + "fmla z28.s, p3/M, z1.s, z9.s\n" + "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n" + "fmla z31.s, p3/M, z2.s, z5.s\n" + "ld1w { z5.s }, p1/Z, [x13]\n" + "fmla z30.s, p3/M, z2.s, z6.s\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n" + "whilelt p2.s, x17, %x[n_channels]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n" + "addvl x27, x27, #1\n" + "fmla z31.s, p3/M, z3.s, z6.s\n" + "ld1w { z6.s }, p1/Z, [x13, x15, LSL #2]\n" + "addvl x8, x8, #16\n" + "fmla z30.s, p3/M, z3.s, z8.s\n" + "cmp x16, %x[n_channels]\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p1/Z, [x13, x25, LSL #2]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z8.s\n" + "ld1w { z8.s }, p1/Z, [x20, x15, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "ld1w { z10.s }, p1/Z, [x20, x23, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p1/Z, 
[x13, x24, LSL #2]\n" + "fmla z28.s, p3/M, z4.s, z9.s\n" + "ld1w { z9.s }, p1/Z, [x13, x26, LSL #2]\n" + "ld1w { z4.s }, p3/Z, [x8, #-7, MUL VL]\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "addvl x8, x8, #-6\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "st1w { z31.s }, p0, [x11]\n" + "mov z31.d, z16.d\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "st1w { z30.s }, p0, [x11, x12, LSL #2]\n" + "mov z30.d, z16.d\n" + "addvl x11, x11, #1\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z29.s }, p0, [x22]\n" + "mov z29.d, z16.d\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z28.s }, p0, [x22, x12, LSL #2]\n" + "mov z28.d, z16.d\n" + "addvl x22, x22, #1\n" + "blt 2b\n" + "3:" // Tile loop: Channel tail + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x20, x25, LSL #2]\n" + "mov p0.b, p2.b\n" + "fmla z30.s, p3/M, z0.s, z6.s\n" + "ldr x5, [%x[params_struct], %[offsetof_args_tile_i]]\n" + "add x21, x5, #0x1\n" + "fmla z29.s, p3/M, z0.s, z7.s\n" + "ldr x6, [%x[params_struct], %[offsetof_args_tile_j]]\n" + "fmla z28.s, p3/M, z0.s, z8.s\n" + "ld1w { z0.s }, p3/Z, [x8]\n" + "add x6, x6, #0x1\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x20, x24, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n" + "fmla z29.s, p3/M, z1.s, z8.s\n" + "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n" + "cmp x6, x19\n" + "fmla z28.s, p3/M, z1.s, z13.s\n" + "ld1w { z1.s }, p3/Z, [x8, #1, MUL VL]\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x13, x23, LSL #2]\n" + "csel x6, x6, XZR, LT\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "csel x5, x5, x21, LT\n" + "fmla z29.s, p3/M, z2.s, z13.s\n" + "cmp x5, x20\n" + "fmla z28.s, p3/M, z2.s, z5.s\n" + "ld1w { z2.s }, p3/Z, [x8, #2, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x10, x15, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "fmla z29.s, p3/M, z3.s, z5.s\n" + "fmla z28.s, p3/M, z3.s, z6.s\n" + "ld1w { z3.s }, p3/Z, [x8, #3, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x10, x26, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x10, x25, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z6.s\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "ld1w { z4.s }, p3/Z, [x8, #4, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z7.s\n" + "fmla z30.s, p3/M, z0.s, z8.s\n" + "fmla z29.s, p3/M, z0.s, z14.s\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "ld1w { z0.s }, p3/Z, [x8, #5, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z8.s\n" + "ld1w { z8.s }, p2/Z, [x10, x23, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z13.s\n" + "fmla z29.s, p3/M, z1.s, z11.s\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "ld1w { z1.s }, p3/Z, [x8, #6, MUL VL]\n" + "fmla z31.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x10, x24, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z5.s\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "fmla z28.s, p3/M, z2.s, z9.s\n" + "ld1w { z2.s }, p3/Z, [x8, #7, MUL VL]\n" + "addvl x8, x8, #16\n" + "fmla z31.s, p3/M, z3.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x9]\n" + "fmla z30.s, p3/M, z3.s, z6.s\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "ld1w { z3.s }, p3/Z, [x8, #-8, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x9, x15, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x9, x26, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "fmla z28.s, p3/M, z4.s, z8.s\n" + "ld1w { z4.s }, 
p3/Z, [x8, #-7, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z14.s\n" + "ld1w { z14.s }, p2/Z, [x9, x23, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z11.s\n" + "fmla z29.s, p3/M, z0.s, z5.s\n" + "fmla z28.s, p3/M, z0.s, z6.s\n" + "ld1w { z0.s }, p3/Z, [x8, #-6, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x9, x25, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z12.s\n" + "fmla z29.s, p3/M, z1.s, z6.s\n" + "fmla z28.s, p3/M, z1.s, z10.s\n" + "ld1w { z1.s }, p3/Z, [x8, #-5, MUL VL]\n" + "fmla z31.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x9, x24, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z9.s\n" + "fmla z29.s, p3/M, z2.s, z10.s\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ld1w { z2.s }, p3/Z, [x8, #-4, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x28]\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ld1w { z3.s }, p3/Z, [x8, #-3, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x28, x15, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z8.s\n" + "ld1w { z8.s }, p2/Z, [x28, x24, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "fmla z28.s, p3/M, z4.s, z14.s\n" + "ld1w { z4.s }, p3/Z, [x8, #-2, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x28, x26, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z6.s\n" + "fmla z29.s, p3/M, z0.s, z9.s\n" + "fmla z28.s, p3/M, z0.s, z13.s\n" + "ld1w { z0.s }, p3/Z, [x8, #-1, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x28, x25, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "fmla z29.s, p3/M, z1.s, z13.s\n" + "fmla z28.s, p3/M, z1.s, z5.s\n" + "ld1w { z1.s }, p3/Z, [x8]\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "ld1w { z10.s }, p2/Z, [x28, x23, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "fmla z29.s, p3/M, z2.s, z5.s\n" + "fmla z28.s, p3/M, z2.s, z6.s\n" + "ld1w { z2.s }, p3/Z, [x8, #1, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x27]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "fmla z29.s, p3/M, z3.s, z6.s\n" + "fmla z28.s, p3/M, z3.s, z8.s\n" + "ld1w { z3.s }, p3/Z, [x8, #2, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x15, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z14.s\n" + "fmla z29.s, p3/M, z4.s, z8.s\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "ld1w { z4.s }, p3/Z, [x8, #3, MUL VL]\n" + "fmla z31.s, p3/M, z0.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x27, x26, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x27, x25, LSL #2]\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "fmla z30.s, p3/M, z1.s, z5.s\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x27, x24, LSL #2]\n" + "fmla z28.s, p3/M, z1.s, z9.s\n" + "fmla z31.s, p3/M, z2.s, z5.s\n" + "fmla z30.s, p3/M, z2.s, z6.s\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x27, x23, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "fmla z31.s, p3/M, z3.s, z6.s\n" + "fmla z30.s, p3/M, z3.s, z8.s\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "fmla z31.s, p3/M, z4.s, z8.s\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "fmla z28.s, p3/M, z4.s, z9.s\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "st1w { z31.s }, p0, [x11]\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "fmin z29.s, p3/M, z29.s, 
z17.s\n" + "st1w { z30.s }, p0, [x11, x12, LSL #2]\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z29.s }, p0, [x22]\n" + "st1w { z28.s }, p0, [x22, x12, LSL #2]\n" + "blt 1b\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (¶ms_struct) + : "cc", "memory", "p0", "p1", "p2", "p3", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..6e35ee86c5 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp new file mode 100644 index 0000000000..6e35ee86c5 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *params, + unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + struct Args + { + float *const *outptrs; + const void *params; + const float min, max; + const float *inptrs[36]; + + Args( + const float *const *const input_ptrs, + float *const *const outptrs, + const void *const params, + const float min, + const float max + ) : outptrs(outptrs), params(params), min(min), max(max) + { + inptrs[0] = input_ptrs[0]; + inptrs[1] = input_ptrs[1]; + inptrs[2] = input_ptrs[6]; + inptrs[3] = input_ptrs[7]; + inptrs[4] = input_ptrs[2]; + inptrs[5] = input_ptrs[8]; + inptrs[6] = input_ptrs[3]; + inptrs[7] = input_ptrs[4]; + inptrs[8] = input_ptrs[11]; + inptrs[9] = input_ptrs[12]; + inptrs[10] = input_ptrs[9]; + inptrs[11] = input_ptrs[10]; + inptrs[12] = input_ptrs[5]; + inptrs[13] = input_ptrs[13]; + inptrs[14] = input_ptrs[14]; + inptrs[15] = input_ptrs[15]; + inptrs[16] = input_ptrs[16]; + inptrs[17] = input_ptrs[17]; + inptrs[18] = input_ptrs[18]; + inptrs[19] = input_ptrs[19]; + inptrs[20] = input_ptrs[20]; + inptrs[21] = input_ptrs[21]; + inptrs[22] = input_ptrs[22]; + inptrs[23] = input_ptrs[23]; + inptrs[24] = input_ptrs[24]; + inptrs[25] = input_ptrs[25]; + inptrs[26] = input_ptrs[26]; + inptrs[27] = input_ptrs[27]; + inptrs[28] = input_ptrs[28]; + inptrs[29] = input_ptrs[29]; + inptrs[30] = input_ptrs[30]; + inptrs[31] = input_ptrs[31]; + inptrs[32] = input_ptrs[32]; + inptrs[33] = input_ptrs[33]; + inptrs[34] = input_ptrs[34]; + inptrs[35] = input_ptrs[35]; + + } + }; + + Args params_struct(input_ptrs, outptrs, params, + activation_min, activation_max); + + __asm__ __volatile__( + "ldr x19, [%x[params_struct], %[offsetof_args_outptrs]]\n" + "ptrue p3.b\n" + "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n" + "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n" + "ld1rw { z18.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n" + "cntb x14, ALL, MUL #2\n" + "ld1rw { z17.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n" + "mov x13, #0x0\n" + "ldp x12, x11, [x19, #0x0]\n" + "cntw x10\n" + "ldp x9, x28, [x19, #0x10]\n" + "sub x27, XZR, x10\n" + "ld1w { z16.s }, p3/Z, [x16]\n" + "mov z31.d, z16.d\n" + "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n" + "whilelt p2.s, XZR, %x[n_channels]\n" + "mov z30.d, z16.d\n" + "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n" + "cmp x10, %x[n_channels]\n" + "mov z29.d, z16.d\n" + "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n" + "mov z28.d, z16.d\n" + "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n" + "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n" + "addvl x16, x16, #6\n" + "ldp x26, x25, [x15, #0x0]\n" + "ldp x24, x23, [x15, #0x10]\n" + "ldp x20, x19, [x15, #0x20]\n" + "ld1w { z5.s }, p2/Z, [x26, x13, LSL #2]\n" + "prfm pldl1keep, [x26, x14]\n" + "ld1w { z6.s }, p2/Z, [x25, x13, LSL #2]\n" + "prfm pldl1keep, [x25, x14]\n" + "ld1w { z7.s }, p2/Z, [x24, x13, LSL #2]\n" + "prfm pldl1keep, [x24, x14]\n" + "ld1w { z8.s }, p2/Z, [x23, x13, LSL #2]\n" + "prfm pldl1keep, [x23, x14]\n" + "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n" + "prfm pldl1keep, [x20, x14]\n" + "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n" + "prfm pldl1keep, [x19, x14]\n" + "ldp x22, x21, [x15, #0x30]\n" + "ldp x20, x19, [x15, #0x40]\n" + "ld1w { z11.s }, p2/Z, [x22, x13, LSL #2]\n" + "prfm pldl1keep, [x22,
x14]\n" + "ld1w { z12.s }, p2/Z, [x21, x13, LSL #2]\n" + "prfm pldl1keep, [x21, x14]\n" + "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n" + "prfm pldl1keep, [x20, x14]\n" + "ld1w { z14.s }, p2/Z, [x19, x13, LSL #2]\n" + "prfm pldl1keep, [x19, x14]\n" + "bge 2f\n" + "1:" // Channel loop + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ldr x21, [x15, #0x50]\n" + "whilelt p1.s, x10, %x[n_channels]\n" + "fmla z30.s, p3/M, z0.s, z6.s\n" + "ldr x19, [x15, #0x58]\n" + "incw x27\n" + "fmla z29.s, p3/M, z0.s, z7.s\n" + "ldr x20, [x15, #0x60]\n" + "mov p0.b, p2.b\n" + "fmla z28.s, p3/M, z0.s, z8.s\n" + "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z1.s, z8.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z28.s, p3/M, z1.s, z13.s\n" + "ld1w { z0.s }, p3/Z, [x16]\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "ldr x19, [x15, #0x68]\n" + "fmla z29.s, p3/M, z2.s, z13.s\n" + "ld1w { z1.s }, p3/Z, [x16, #1, MUL VL]\n" + "fmla z28.s, p3/M, z2.s, z5.s\n" + "ldr x20, [x15, #0x70]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z3.s, z5.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z28.s, p3/M, z3.s, z6.s\n" + "ld1w { z2.s }, p3/Z, [x16, #2, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z9.s\n" + "ldr x19, [x15, #0x78]\n" + "fmla z29.s, p3/M, z4.s, z6.s\n" + "ld1w { z3.s }, p3/Z, [x16, #3, MUL VL]\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "ldr x26, [x15, #0x80]\n" + "fmla z31.s, p3/M, z0.s, z7.s\n" + "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z8.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z0.s, z14.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "ld1w { z4.s }, p3/Z, [x16, #4, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z8.s\n" + "ldr x25, [x15, #0x88]\n" + "fmla z30.s, p3/M, z1.s, z13.s\n" + "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n" + "fmla z29.s, p3/M, z1.s, z11.s\n" + "ldr x24, [x15, #0x90]\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z5.s\n" + "prfm pldl1keep, [x25, x14]\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmla z28.s, p3/M, z2.s, z9.s\n" + "ldr x23, [x15, #0x98]\n" + "fmla z31.s, p3/M, z3.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z6.s\n" + "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "ldr x20, [x15, #0xa0]\n" + "fmla z31.s, p3/M, z4.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "addvl x16, x16, #16\n" + "fmla z28.s, p3/M, z4.s, z8.s\n" + "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z31.s, p3/M, z0.s, z14.s\n" + "ldr x19, [x15, #0xa8]\n" + "fmla z30.s, p3/M, z0.s, z11.s\n" + "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n" + "fmla z29.s, p3/M, z0.s, z5.s\n" + "ldr x22, [x15, #0xb0]\n" + "fmla 
z28.s, p3/M, z0.s, z6.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x22, x14]\n" + "fmla z29.s, p3/M, z1.s, z6.s\n" + "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n" + "fmla z28.s, p3/M, z1.s, z10.s\n" + "ldr x21, [x15, #0xb8]\n" + "fmla z31.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z9.s\n" + "ld1w { z0.s }, p3/Z, [x16, #-6, MUL VL]\n" + "fmla z29.s, p3/M, z2.s, z10.s\n" + "ld1w { z14.s }, p2/Z, [x21, x13, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "ldr x20, [x15, #0xc0]\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "ldr x19, [x15, #0xc8]\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "ld1w { z1.s }, p3/Z, [x16, #-5, MUL VL]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z30.s, p3/M, z4.s, z8.s\n" + "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z28.s, p3/M, z4.s, z14.s\n" + "ldr x21, [x15, #0xd0]\n" + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ld1w { z2.s }, p3/Z, [x16, #-4, MUL VL]\n" + "fmla z30.s, p3/M, z0.s, z6.s\n" + "ldr x19, [x15, #0xd8]\n" + "fmla z29.s, p3/M, z0.s, z9.s\n" + "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n" + "fmla z28.s, p3/M, z0.s, z13.s\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z1.s, z13.s\n" + "ld1w { z3.s }, p3/Z, [x16, #-3, MUL VL]\n" + "fmla z28.s, p3/M, z1.s, z5.s\n" + "ldr x20, [x15, #0xe0]\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "ld1w { z4.s }, p3/Z, [x16, #-2, MUL VL]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "ldr x19, [x15, #0xe8]\n" + "fmla z29.s, p3/M, z2.s, z5.s\n" + "ld1w { z8.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z6.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z3.s, z6.s\n" + "ld1w { z0.s }, p3/Z, [x16, #-1, MUL VL]\n" + "fmla z28.s, p3/M, z3.s, z8.s\n" + "ldr x20, [x15, #0xf0]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ldr x19, [x15, #0xf8]\n" + "fmla z30.s, p3/M, z4.s, z14.s\n" + "ld1w { z1.s }, p3/Z, [x16]\n" + "fmla z29.s, p3/M, z4.s, z8.s\n" + "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z31.s, p3/M, z0.s, z9.s\n" + "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ldr x26, [x15, #0x100]\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "ld1w { z2.s }, p3/Z, [x16, #1, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "ldr x25, [x15, #0x108]\n" + "fmla z30.s, p3/M, z1.s, z5.s\n" + "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z28.s, p3/M, z1.s, z9.s\n" + "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z5.s\n" + "prfm pldl1keep, [x25, x14]\n" + "fmla z30.s, p3/M, z2.s, z6.s\n" + "ld1w { z3.s }, p3/Z, [x16, #2, MUL VL]\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "ldr x24, [x15, #0x110]\n" + "ld1w { z4.s }, p3/Z, [x16, #3, 
MUL VL]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ldr x23, [x15, #0x118]\n" + "fmla z31.s, p3/M, z3.s, z6.s\n" + "fmla z30.s, p3/M, z3.s, z8.s\n" + "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "prfm pldl1keep, [x24, x14]\n" + "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x23, x14]\n" + "incw x13\n" + "fmla z31.s, p3/M, z4.s, z8.s\n" + "ldp x26, x25, [x15, #0x0]\n" + "whilelt p2.s, x13, %x[n_channels]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "ldp x24, x23, [x15, #0x10]\n" + "addvl x14, x14, #1\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "ldp x20, x19, [x15, #0x20]\n" + "ldp x22, x21, [x15, #0x30]\n" + "fmla z28.s, p3/M, z4.s, z9.s\n" + "ld1w { z5.s }, p1/Z, [x26, x10, LSL #2]\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "ld1w { z6.s }, p1/Z, [x25, x10, LSL #2]\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "prfm pldl1keep, [x25, x14]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "ld1w { z7.s }, p1/Z, [x24, x10, LSL #2]\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "ld1w { z8.s }, p1/Z, [x23, x10, LSL #2]\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "ld1w { z9.s }, p1/Z, [x20, x10, LSL #2]\n" + "prfm pldl1keep, [x20, x14]\n" + "ld1w { z13.s }, p1/Z, [x19, x10, LSL #2]\n" + "prfm pldl1keep, [x19, x14]\n" + "ld1w { z11.s }, p1/Z, [x22, x10, LSL #2]\n" + "prfm pldl1keep, [x22, x14]\n" + "ld1w { z12.s }, p1/Z, [x21, x10, LSL #2]\n" + "prfm pldl1keep, [x21, x14]\n" + "ldp x20, x19, [x15, #0x40]\n" + "st1w { z31.s }, p0, [x12, x27, LSL #2]\n" + "st1w { z30.s }, p0, [x11, x27, LSL #2]\n" + "ld1w { z10.s }, p1/Z, [x20, x10, LSL #2]\n" + "prfm pldl1keep, [x20, x14]\n" + "ld1w { z14.s }, p1/Z, [x19, x10, LSL #2]\n" + "incw x10\n" + "prfm pldl1keep, [x19, x14]\n" + "cmp x10, %x[n_channels]\n" + "st1w { z29.s }, p0, [x9, x27, LSL #2]\n" + "st1w { z28.s }, p0, [x28, x27, LSL #2]\n" + "ld1w { z16.s }, p3/Z, [x16, #4, MUL VL]\n" + "mov z31.d, z16.d\n" + "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n" + "mov z30.d, z16.d\n" + "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n" + "mov z29.d, z16.d\n" + "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n" + "addvl x16, x16, #16\n" + "mov z28.d, z16.d\n" + "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n" + "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n" + "addvl x16, x16, #-6\n" + "blt 1b\n" + "2:" // Channel tail + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ldr x21, [x15, #0x50]\n" + "incw x27\n" + "fmla z30.s, p3/M, z0.s, z6.s\n" + "ldr x19, [x15, #0x58]\n" + "mov p0.b, p2.b\n" + "fmla z29.s, p3/M, z0.s, z7.s\n" + "ldr x20, [x15, #0x60]\n" + "fmla z28.s, p3/M, z0.s, z8.s\n" + "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z9.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z1.s, z8.s\n" + "fmla z28.s, p3/M, z1.s, z13.s\n" + "prfm pldl1keep, [x20, x14]\n" + "ld1w { z0.s }, p3/Z, [x16]\n" + "fmla z31.s, p3/M, z2.s, z9.s\n" + "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "ldr x19, [x15, #0x68]\n" + "fmla z29.s, p3/M, z2.s, z13.s\n" + "fmla z28.s, p3/M, z2.s, z5.s\n" + "ld1w { z1.s }, p3/Z, [x16, #1, MUL VL]\n" + "ldr x20, [x15, #0x70]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, 
z12.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z3.s, z5.s\n" + "fmla z28.s, p3/M, z3.s, z6.s\n" + "prfm pldl1keep, [x20, x14]\n" + "ld1w { z2.s }, p3/Z, [x16, #2, MUL VL]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z9.s\n" + "ldr x19, [x15, #0x78]\n" + "fmla z29.s, p3/M, z4.s, z6.s\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "ld1w { z3.s }, p3/Z, [x16, #3, MUL VL]\n" + "ldr x26, [x15, #0x80]\n" + "fmla z31.s, p3/M, z0.s, z7.s\n" + "ld1w { z9.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z8.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z0.s, z14.s\n" + "fmla z28.s, p3/M, z0.s, z11.s\n" + "prfm pldl1keep, [x26, x14]\n" + "ld1w { z4.s }, p3/Z, [x16, #4, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z8.s\n" + "ldr x25, [x15, #0x88]\n" + "fmla z30.s, p3/M, z1.s, z13.s\n" + "ld1w { z0.s }, p3/Z, [x16, #5, MUL VL]\n" + "fmla z29.s, p3/M, z1.s, z11.s\n" + "fmla z28.s, p3/M, z1.s, z12.s\n" + "ld1w { z8.s }, p2/Z, [x25, x13, LSL #2]\n" + "prfm pldl1keep, [x25, x14]\n" + "fmla z31.s, p3/M, z2.s, z13.s\n" + "ld1w { z13.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z5.s\n" + "ldr x24, [x15, #0x90]\n" + "fmla z29.s, p3/M, z2.s, z12.s\n" + "fmla z28.s, p3/M, z2.s, z9.s\n" + "ldr x23, [x15, #0x98]\n" + "ld1w { z1.s }, p3/Z, [x16, #6, MUL VL]\n" + "fmla z31.s, p3/M, z3.s, z5.s\n" + "ld1w { z5.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z6.s\n" + "prfm pldl1keep, [x24, x14]\n" + "fmla z29.s, p3/M, z3.s, z9.s\n" + "fmla z28.s, p3/M, z3.s, z13.s\n" + "prfm pldl1keep, [x23, x14]\n" + "ldr x20, [x15, #0xa0]\n" + "fmla z31.s, p3/M, z4.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "ld1w { z2.s }, p3/Z, [x16, #7, MUL VL]\n" + "fmla z29.s, p3/M, z4.s, z13.s\n" + "addvl x16, x16, #16\n" + "fmla z28.s, p3/M, z4.s, z8.s\n" + "ld1w { z10.s }, p2/Z, [x20, x13, LSL #2]\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z31.s, p3/M, z0.s, z14.s\n" + "ldr x19, [x15, #0xa8]\n" + "fmla z30.s, p3/M, z0.s, z11.s\n" + "ld1w { z3.s }, p3/Z, [x16, #-8, MUL VL]\n" + "fmla z29.s, p3/M, z0.s, z5.s\n" + "ldr x22, [x15, #0xb0]\n" + "fmla z28.s, p3/M, z0.s, z6.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z31.s, p3/M, z1.s, z11.s\n" + "ld1w { z11.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x22, x14]\n" + "fmla z29.s, p3/M, z1.s, z6.s\n" + "ld1w { z4.s }, p3/Z, [x16, #-7, MUL VL]\n" + "fmla z28.s, p3/M, z1.s, z10.s\n" + "ldr x21, [x15, #0xb8]\n" + "fmla z31.s, p3/M, z2.s, z12.s\n" + "ld1w { z12.s }, p2/Z, [x22, x13, LSL #2]\n" + "fmla z30.s, p3/M, z2.s, z9.s\n" + "ld1w { z0.s }, p3/Z, [x16, #-6, MUL VL]\n" + "fmla z29.s, p3/M, z2.s, z10.s\n" + "ld1w { z14.s }, p2/Z, [x21, x13, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z31.s, p3/M, z3.s, z9.s\n" + "ldr x20, [x15, #0xc0]\n" + "fmla z30.s, p3/M, z3.s, z13.s\n" + "ldr x19, [x15, #0xc8]\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "ld1w { z1.s }, p3/Z, [x16, #-5, MUL VL]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "ld1w { z9.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z31.s, p3/M, z4.s, z13.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z30.s, p3/M, z4.s, z8.s\n" + "ld1w { z13.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z28.s, p3/M, z4.s, z14.s\n" + "ldr x21, [x15, #0xd0]\n" + "fmla z31.s, p3/M, z0.s, z5.s\n" + "ld1w { z2.s }, p3/Z, [x16, #-4, MUL VL]\n" + "fmla z30.s, p3/M, 
z0.s, z6.s\n" + "ldr x19, [x15, #0xd8]\n" + "fmla z29.s, p3/M, z0.s, z9.s\n" + "ld1w { z5.s }, p2/Z, [x21, x13, LSL #2]\n" + "fmla z28.s, p3/M, z0.s, z13.s\n" + "prfm pldl1keep, [x21, x14]\n" + "fmla z31.s, p3/M, z1.s, z6.s\n" + "ld1w { z6.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z1.s, z10.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z1.s, z13.s\n" + "ld1w { z3.s }, p3/Z, [x16, #-3, MUL VL]\n" + "fmla z28.s, p3/M, z1.s, z5.s\n" + "ldr x20, [x15, #0xe0]\n" + "fmla z31.s, p3/M, z2.s, z10.s\n" + "ld1w { z4.s }, p3/Z, [x16, #-2, MUL VL]\n" + "fmla z30.s, p3/M, z2.s, z11.s\n" + "ldr x19, [x15, #0xe8]\n" + "fmla z29.s, p3/M, z2.s, z5.s\n" + "ld1w { z8.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z28.s, p3/M, z2.s, z6.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z31.s, p3/M, z3.s, z11.s\n" + "ld1w { z10.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z3.s, z6.s\n" + "ld1w { z0.s }, p3/Z, [x16, #-1, MUL VL]\n" + "fmla z28.s, p3/M, z3.s, z8.s\n" + "ldr x20, [x15, #0xf0]\n" + "fmla z31.s, p3/M, z4.s, z12.s\n" + "ldr x19, [x15, #0xf8]\n" + "fmla z30.s, p3/M, z4.s, z14.s\n" + "ld1w { z1.s }, p3/Z, [x16]\n" + "fmla z29.s, p3/M, z4.s, z8.s\n" + "ld1w { z11.s }, p2/Z, [x20, x13, LSL #2]\n" + "fmla z28.s, p3/M, z4.s, z10.s\n" + "prfm pldl1keep, [x20, x14]\n" + "fmla z31.s, p3/M, z0.s, z9.s\n" + "ld1w { z12.s }, p2/Z, [x19, x13, LSL #2]\n" + "fmla z30.s, p3/M, z0.s, z13.s\n" + "prfm pldl1keep, [x19, x14]\n" + "fmla z29.s, p3/M, z0.s, z11.s\n" + "ldr x26, [x15, #0x100]\n" + "fmla z28.s, p3/M, z0.s, z12.s\n" + "ld1w { z2.s }, p3/Z, [x16, #1, MUL VL]\n" + "fmla z31.s, p3/M, z1.s, z13.s\n" + "ldr x25, [x15, #0x108]\n" + "fmla z30.s, p3/M, z1.s, z5.s\n" + "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n" + "fmla z29.s, p3/M, z1.s, z12.s\n" + "prfm pldl1keep, [x26, x14]\n" + "fmla z28.s, p3/M, z1.s, z9.s\n" + "ld1w { z11.s }, p2/Z, [x25, x13, LSL #2]\n" + "fmla z31.s, p3/M, z2.s, z5.s\n" + "prfm pldl1keep, [x25, x14]\n" + "fmla z30.s, p3/M, z2.s, z6.s\n" + "ld1w { z3.s }, p3/Z, [x16, #2, MUL VL]\n" + "fmla z29.s, p3/M, z2.s, z9.s\n" + "ldr x24, [x15, #0x110]\n" + "ld1w { z4.s }, p3/Z, [x16, #3, MUL VL]\n" + "fmla z28.s, p3/M, z2.s, z11.s\n" + "ldr x23, [x15, #0x118]\n" + "fmla z31.s, p3/M, z3.s, z6.s\n" + "fmla z30.s, p3/M, z3.s, z8.s\n" + "ld1w { z12.s }, p2/Z, [x24, x13, LSL #2]\n" + "fmla z29.s, p3/M, z3.s, z11.s\n" + "prfm pldl1keep, [x24, x14]\n" + "ld1w { z9.s }, p2/Z, [x23, x13, LSL #2]\n" + "fmla z28.s, p3/M, z3.s, z12.s\n" + "prfm pldl1keep, [x23, x14]\n" + "fmla z31.s, p3/M, z4.s, z8.s\n" + "fmla z30.s, p3/M, z4.s, z10.s\n" + "fmla z29.s, p3/M, z4.s, z12.s\n" + "fmla z28.s, p3/M, z4.s, z9.s\n" + "fmax z31.s, p3/M, z31.s, z18.s\n" + "fmax z30.s, p3/M, z30.s, z18.s\n" + "fmax z29.s, p3/M, z29.s, z18.s\n" + "fmin z31.s, p3/M, z31.s, z17.s\n" + "st1w { z31.s }, p0, [x12, x27, LSL #2]\n" + "fmin z30.s, p3/M, z30.s, z17.s\n" + "fmin z29.s, p3/M, z29.s, z17.s\n" + "st1w { z30.s }, p0, [x11, x27, LSL #2]\n" + "fmax z28.s, p3/M, z28.s, z18.s\n" + "st1w { z29.s }, p0, [x9, x27, LSL #2]\n" + "fmin z28.s, p3/M, z28.s, z17.s\n" + "st1w { z28.s }, p0, [x28, x27, LSL #2]\n" + : + : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) + : "cc", 
"memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp new file mode 100644 index 0000000000..dd2c519e3a --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float); + +struct sve_fp32_nhwc_generic_output9_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int n_output_points = 9; + + kern_type kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl; + + sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..370218e1d4 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const void *params, + const void *bias, + const unsigned int n_points, + const unsigned int n_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ptrue p1.b\n" + "ld1rw { z4.s }, p1/Z, [%x[minmax_vals]]\n" + "mov x28, #0x0\n" + "ld1rw { z3.s }, p1/Z, [%x[minmax_vals], #4]\n" + "whilelt p0.s, x28, %x[n_channels]\n" + "1:" // Channel loop + "mov z2.b, #0x0\n" + "cbz %x[bias], 2f\n" + "ld1w { z2.s }, p0/Z, [%x[bias], x28, LSL #2]\n" + "2:" // Channel loop: Load bias: Done + "mov z1.d, z2.d\n" + "ld1w { z0.s }, p1/Z, [%x[params]]\n" + "mov x22, %x[inptrs]\n" + "mov z31.d, z2.d\n" + "ldp x20, x19, [x22], #0x10\n" + "subs x21, %x[n_points], #0x1\n" + "mov z30.d, z2.d\n" + "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n" + "mov z28.d, z2.d\n" + "addvl %x[params], %x[params], #1\n" + "mov z27.d, z2.d\n" + "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n" + "mov z25.d, z2.d\n" + "ldp x20, x19, [x22], #0x10\n" + "mov z24.d, z2.d\n" + "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n" + "mov z22.d, z2.d\n" + "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n" + "ldp x20, x19, [x22], #0x10\n" + "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n" + "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n" + "ldp x20, x19, [x22], #0x10\n" + "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n" + "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n" + "ldr x19, [x22], #0x8\n" + "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n" + "ble 4f\n" + "3:" // Channel loop: Planar loop + "fmla z2.s, p1/M, z29.s, z0.s\n" + "ldp x20, x19, [x22], #0x10\n" + "subs x21, x21, #0x1\n" + "fmla z1.s, p1/M, z26.s, z0.s\n" + "ld1w { z29.s }, p0/Z, [x20, x28, LSL #2]\n" + "fmla z31.s, p1/M, z23.s, z0.s\n" + "fmla z30.s, p1/M, z21.s, z0.s\n" + "ld1w { z26.s }, p0/Z, [x19, x28, LSL #2]\n" + "fmla z28.s, p1/M, z20.s, z0.s\n" + "ldp x20, x19, [x22], #0x10\n" + "fmla z27.s, p1/M, z19.s, z0.s\n" + "ld1w { z23.s }, p0/Z, [x20, x28, LSL #2]\n" + "fmla z25.s, p1/M, z18.s, z0.s\n" + "fmla z24.s, p1/M, z17.s, z0.s\n" + "ld1w { z21.s }, p0/Z, [x19, x28, LSL #2]\n" + "fmla z22.s, p1/M, z16.s, z0.s\n" + "ld1w { z0.s }, p1/Z, [%x[params]]\n" +
"addvl %x[params], %x[params], #1\n" + "ldp x20, x19, [x22], #0x10\n" + "ld1w { z20.s }, p0/Z, [x20, x28, LSL #2]\n" + "ld1w { z19.s }, p0/Z, [x19, x28, LSL #2]\n" + "ldp x20, x19, [x22], #0x10\n" + "ld1w { z18.s }, p0/Z, [x20, x28, LSL #2]\n" + "ld1w { z17.s }, p0/Z, [x19, x28, LSL #2]\n" + "ldr x19, [x22], #0x8\n" + "ld1w { z16.s }, p0/Z, [x19, x28, LSL #2]\n" + "bgt 3b\n" + "4:" // Channel loop: Planar tail + "fmla z2.s, p1/M, z29.s, z0.s\n" + "ldp x27, x26, [%x[outptrs], #0x0]\n" + "fmla z1.s, p1/M, z26.s, z0.s\n" + "ldp x25, x24, [%x[outptrs], #0x10]\n" + "fmla z31.s, p1/M, z23.s, z0.s\n" + "ldp x23, x22, [%x[outptrs], #0x20]\n" + "fmla z30.s, p1/M, z21.s, z0.s\n" + "ldp x21, x20, [%x[outptrs], #0x30]\n" + "fmla z28.s, p1/M, z20.s, z0.s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmla z27.s, p1/M, z19.s, z0.s\n" + "fmla z25.s, p1/M, z18.s, z0.s\n" + "fmla z24.s, p1/M, z17.s, z0.s\n" + "fmla z22.s, p1/M, z16.s, z0.s\n" + "fmax z2.s, p1/M, z2.s, z4.s\n" + "fmax z1.s, p1/M, z1.s, z4.s\n" + "fmax z31.s, p1/M, z31.s, z4.s\n" + "fmax z30.s, p1/M, z30.s, z4.s\n" + "fmin z2.s, p1/M, z2.s, z3.s\n" + "st1w { z2.s }, p0, [x27, x28, LSL #2]\n" + "fmin z1.s, p1/M, z1.s, z3.s\n" + "fmin z31.s, p1/M, z31.s, z3.s\n" + "st1w { z1.s }, p0, [x26, x28, LSL #2]\n" + "fmin z30.s, p1/M, z30.s, z3.s\n" + "fmax z28.s, p1/M, z28.s, z4.s\n" + "st1w { z31.s }, p0, [x25, x28, LSL #2]\n" + "fmax z27.s, p1/M, z27.s, z4.s\n" + "st1w { z30.s }, p0, [x24, x28, LSL #2]\n" + "fmin z28.s, p1/M, z28.s, z3.s\n" + "fmax z25.s, p1/M, z25.s, z4.s\n" + "st1w { z28.s }, p0, [x23, x28, LSL #2]\n" + "fmin z27.s, p1/M, z27.s, z3.s\n" + "fmin z25.s, p1/M, z25.s, z3.s\n" + "st1w { z27.s }, p0, [x22, x28, LSL #2]\n" + "fmax z24.s, p1/M, z24.s, z4.s\n" + "fmax z22.s, p1/M, z22.s, z4.s\n" + "st1w { z25.s }, p0, [x21, x28, LSL #2]\n" + "fmin z24.s, p1/M, z24.s, z3.s\n" + "st1w { z24.s }, p0, [x20, x28, LSL #2]\n" + "fmin z22.s, p1/M, z22.s, z3.s\n" + "st1w { z22.s }, p0, [x19, x28, LSL #2]\n" + "incw x28\n" + "whilelt p0.s, x28, %x[n_channels]\n" + "b.any 1b\n" + : [params] "+&r" (params) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs) + : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp new file mode 100644 index 0000000000..5cf3314c65 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); + +struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 3; + constexpr static unsigned int output_cols = 3; + + constexpr static unsigned int input_rows = 7; + constexpr static unsigned int input_cols = 7; + constexpr static unsigned int input_col_quads = 2; + + kern_type kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl; + + sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..ce640a207d --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const void *params, + const unsigned int n_output_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ldp x12, x11, [%x[outptrs], #0x0]\n" + "ptrue p2.b\n" + "ldp x10, x9, [%x[outptrs], #0x10]\n" + "mov x28, #0x0\n" + "ldp x27, x26, [%x[outptrs], #0x20]\n" + "mov x25, #0x0\n" + "ldp x24, x23, [%x[outptrs], #0x30]\n" + "whilelt p1.s, x28, %x[channel_multiplier]\n" + "ldr x22, [%x[outptrs], #0x40]\n" + "ldr x21, [%x[inptrs], #0x0]\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "ld1rqw { z2.s }, p2/Z, [x21]\n" + "ld1rqw { z3.s }, p2/Z, [x21, #16]\n" + "ld1rqw { z4.s }, p2/Z, [x20]\n" + "ld1rqw { z5.s }, p2/Z, [x20, #16]\n" + "ld1rqw { z6.s }, p2/Z, [x19]\n" + "ld1rqw { z7.s }, p2/Z, [x19, #16]\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "ldr x19, [%x[inptrs], #0x28]\n" + "ld1rqw { z8.s }, p2/Z, [x21]\n" + "ld1rqw { z9.s }, p2/Z, [x21, #16]\n" + "ld1rqw { z10.s }, p2/Z, [x20]\n" + "ld1rqw { z11.s }, p2/Z, [x20, #16]\n" + "ld1rqw { z12.s }, p2/Z, [x19]\n" + "ld1rqw { z13.s }, p2/Z, [x19, #16]\n" + "ldr x19, [%x[inptrs], #0x30]\n" + "ld1rw { z26.s }, p2/Z, [%x[clamps]]\n" + "ld1rw { z25.s }, p2/Z, [%x[clamps], #4]\n" + "ld1rqw { z14.s }, p2/Z, [x19]\n" + "ld1rqw { z15.s }, p2/Z, [x19, #16]\n" + "ld1w { z24.s }, p1/Z, [%x[params]]\n" + "mov z23.d, z24.d\n" + "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n" + "mov z22.d, z24.d\n" + "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n" + "mov z21.d, z24.d\n" + "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n" + "addvl %x[params], %x[params], #4\n" + "mov z20.d, z24.d\n" + "mov z19.d, z24.d\n" + "mov z18.d, z24.d\n" + "mov z17.d, z24.d\n" + "mov z16.d, z24.d\n" + "1:" // Output channel complete vector loop + "mov z0.d, z10.d\n" + "mov p0.b, p1.b\n" + "mov z1.d, z11.d\n" + "incw x28\n" + "fmla z24.s, z31.s, z2.s[0]\n" + "whilelt p1.s, x28, %x[channel_multiplier]\n" + "fmla z23.s, z31.s, z2.s[2]\n" + "fmla z22.s, z31.s, z3.s[0]\n" + "fmla z21.s, z31.s, z6.s[0]\n" + "fmla z20.s, z31.s, z6.s[2]\n" + "fmla z19.s,
z31.s, z7.s[0]\n" + "fmla z18.s, z31.s, z0.s[0]\n" + "fmla z17.s, z31.s, z0.s[2]\n" + "fmla z16.s, z31.s, z1.s[0]\n" + "ld1w { z31.s }, p2/Z, [%x[params]]\n" + "fmla z24.s, z30.s, z2.s[1]\n" + "fmla z23.s, z30.s, z2.s[3]\n" + "fmla z22.s, z30.s, z3.s[1]\n" + "fmla z21.s, z30.s, z6.s[1]\n" + "fmla z20.s, z30.s, z6.s[3]\n" + "fmla z19.s, z30.s, z7.s[1]\n" + "fmla z18.s, z30.s, z0.s[1]\n" + "fmla z17.s, z30.s, z0.s[3]\n" + "fmla z16.s, z30.s, z1.s[1]\n" + "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "fmla z24.s, z29.s, z2.s[2]\n" + "fmla z23.s, z29.s, z3.s[0]\n" + "fmla z22.s, z29.s, z3.s[2]\n" + "fmla z21.s, z29.s, z6.s[2]\n" + "fmla z20.s, z29.s, z7.s[0]\n" + "fmla z19.s, z29.s, z7.s[2]\n" + "fmla z18.s, z29.s, z0.s[2]\n" + "mov z0.d, z8.d\n" + "fmla z17.s, z29.s, z1.s[0]\n" + "fmla z16.s, z29.s, z1.s[2]\n" + "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "mov z1.d, z9.d\n" + "fmla z24.s, z31.s, z4.s[0]\n" + "fmla z23.s, z31.s, z4.s[2]\n" + "fmla z22.s, z31.s, z5.s[0]\n" + "fmla z21.s, z31.s, z0.s[0]\n" + "fmla z20.s, z31.s, z0.s[2]\n" + "mov z0.d, z12.d\n" + "fmla z19.s, z31.s, z1.s[0]\n" + "mov z1.d, z13.d\n" + "fmla z18.s, z31.s, z0.s[0]\n" + "fmla z17.s, z31.s, z0.s[2]\n" + "mov z0.d, z8.d\n" + "fmla z16.s, z31.s, z1.s[0]\n" + "ld1w { z31.s }, p2/Z, [%x[params], #3, MUL VL]\n" + "mov z1.d, z9.d\n" + "fmla z24.s, z30.s, z4.s[1]\n" + "fmla z23.s, z30.s, z4.s[3]\n" + "fmla z22.s, z30.s, z5.s[1]\n" + "fmla z21.s, z30.s, z0.s[1]\n" + "fmla z20.s, z30.s, z0.s[3]\n" + "mov z0.d, z12.d\n" + "fmla z19.s, z30.s, z1.s[1]\n" + "mov z1.d, z13.d\n" + "fmla z18.s, z30.s, z0.s[1]\n" + "fmla z17.s, z30.s, z0.s[3]\n" + "mov z0.d, z8.d\n" + "fmla z16.s, z30.s, z1.s[1]\n" + "ld1w { z30.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "mov z1.d, z9.d\n" + "fmla z24.s, z29.s, z4.s[2]\n" + "fmla z23.s, z29.s, z5.s[0]\n" + "fmla z22.s, z29.s, z5.s[2]\n" + "fmla z21.s, z29.s, z0.s[2]\n" + "mov z0.d, z12.d\n" + "fmla z20.s, z29.s, z1.s[0]\n" + "fmla z19.s, z29.s, z1.s[2]\n" + "mov z1.d, z13.d\n" + "fmla z18.s, z29.s, z0.s[2]\n" + "mov z0.d, z10.d\n" + "fmla z17.s, z29.s, z1.s[0]\n" + "fmla z16.s, z29.s, z1.s[2]\n" + "ld1w { z29.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "mov z1.d, z11.d\n" + "fmla z24.s, z31.s, z6.s[0]\n" + "fmla z23.s, z31.s, z6.s[2]\n" + "fmla z22.s, z31.s, z7.s[0]\n" + "fmla z21.s, z31.s, z0.s[0]\n" + "fmla z20.s, z31.s, z0.s[2]\n" + "mov z0.d, z14.d\n" + "fmla z19.s, z31.s, z1.s[0]\n" + "mov z1.d, z15.d\n" + "fmla z18.s, z31.s, z0.s[0]\n" + "fmla z17.s, z31.s, z0.s[2]\n" + "mov z0.d, z10.d\n" + "fmla z16.s, z31.s, z1.s[0]\n" + "ld1w { z31.s }, p1/Z, [%x[params], #7, MUL VL]\n" + "mov z1.d, z11.d\n" + "fmla z24.s, z30.s, z6.s[1]\n" + "fmla z23.s, z30.s, z6.s[3]\n" + "fmla z22.s, z30.s, z7.s[1]\n" + "fmla z21.s, z30.s, z0.s[1]\n" + "fmla z20.s, z30.s, z0.s[3]\n" + "mov z0.d, z14.d\n" + "fmla z19.s, z30.s, z1.s[1]\n" + "mov z1.d, z15.d\n" + "fmla z18.s, z30.s, z0.s[1]\n" + "fmla z17.s, z30.s, z0.s[3]\n" + "mov z0.d, z10.d\n" + "fmla z16.s, z30.s, z1.s[1]\n" + "mov z1.d, z11.d\n" + "fmla z24.s, z29.s, z6.s[2]\n" + "fmla z23.s, z29.s, z7.s[0]\n" + "fmla z22.s, z29.s, z7.s[2]\n" + "fmla z21.s, z29.s, z0.s[2]\n" + "mov z0.d, z14.d\n" + "fmla z20.s, z29.s, z1.s[0]\n" + "fmla z19.s, z29.s, z1.s[2]\n" + "mov z1.d, z15.d\n" + "fmla z18.s, z29.s, z0.s[2]\n" + "fmla z17.s, z29.s, z1.s[0]\n" + "fmla z16.s, z29.s, z1.s[2]\n" + "fmin z24.s, p2/M, z24.s, z25.s\n" + "fmin z23.s, p2/M, z23.s, z25.s\n" + "fmin z22.s, p2/M, z22.s, z25.s\n" + "fmin z21.s, p2/M, z21.s, z25.s\n" + "fmax z24.s, p2/M, z24.s, 
z26.s\n" + "st1w { z24.s }, p0, [x12, x25, LSL #2]\n" + "fmax z23.s, p2/M, z23.s, z26.s\n" + "fmax z22.s, p2/M, z22.s, z26.s\n" + "ld1w { z24.s }, p1/Z, [%x[params], #6, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "fmax z21.s, p2/M, z21.s, z26.s\n" + "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n" + "fmin z20.s, p2/M, z20.s, z25.s\n" + "ld1w { z29.s }, p1/Z, [%x[params], #-7, MUL VL]\n" + "addvl %x[params], %x[params], #-6\n" + "fmin z19.s, p2/M, z19.s, z25.s\n" + "st1w { z23.s }, p0, [x11, x25, LSL #2]\n" + "mov z23.d, z24.d\n" + "st1w { z22.s }, p0, [x10, x25, LSL #2]\n" + "mov z22.d, z24.d\n" + "st1w { z21.s }, p0, [x9, x25, LSL #2]\n" + "mov z21.d, z24.d\n" + "fmax z20.s, p2/M, z20.s, z26.s\n" + "st1w { z20.s }, p0, [x27, x25, LSL #2]\n" + "mov z20.d, z24.d\n" + "fmax z19.s, p2/M, z19.s, z26.s\n" + "st1w { z19.s }, p0, [x26, x25, LSL #2]\n" + "mov z19.d, z24.d\n" + "fmin z18.s, p2/M, z18.s, z25.s\n" + "fmin z17.s, p2/M, z17.s, z25.s\n" + "fmin z16.s, p2/M, z16.s, z25.s\n" + "fmax z18.s, p2/M, z18.s, z26.s\n" + "st1w { z18.s }, p0, [x24, x25, LSL #2]\n" + "mov z18.d, z24.d\n" + "fmax z17.s, p2/M, z17.s, z26.s\n" + "st1w { z17.s }, p0, [x23, x25, LSL #2]\n" + "mov z17.d, z24.d\n" + "fmax z16.s, p2/M, z16.s, z26.s\n" + "st1w { z16.s }, p0, [x22, x25, LSL #2]\n" + "mov z16.d, z24.d\n" + "incw x25\n" + "b.any 1b\n" + : [params] "+&r" (params) + : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp new file mode 100644 index 0000000000..3c2f77156d --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); + +struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 4; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 8; + constexpr static unsigned int input_col_quads = 2; + + kern_type kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl; + + sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..453b00c0db --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const void *params, + const unsigned int n_output_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ldp x11, x10, [%x[outptrs], #0x0]\n" + "ptrue p2.b\n" + "ldp x9, x28, [%x[outptrs], #0x10]\n" + "mov x27, #0x0\n" + "ldp x26, x25, [%x[outptrs], #0x20]\n" + "mov x24, #0x0\n" + "ldp x23, x22, [%x[outptrs], #0x30]\n" + "whilelt p1.s, x27, %x[channel_multiplier]\n" + "ldr x21, [%x[inptrs], #0x0]\n" + "ldr x20, [%x[inptrs], #0x8]\n" + "ldr x19, [%x[inptrs], #0x10]\n" + "ld1rqw { z2.s }, p2/Z, [x21]\n" + "ld1rqw { z3.s }, p2/Z, [x21, #16]\n" + "ld1rqw { z4.s }, p2/Z, [x20]\n" + "ld1rqw { z5.s }, p2/Z, [x20, #16]\n" + "ld1rqw { z6.s }, p2/Z, [x19]\n" + "ld1rqw { z7.s }, p2/Z, [x19, #16]\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "ldr x19, [%x[inptrs], #0x28]\n" + "ld1rqw { z8.s }, p2/Z, [x21]\n" + "ld1rqw { z9.s }, p2/Z, [x21, #16]\n" + "ld1rqw { z10.s }, p2/Z, [x20]\n" + "ld1rqw { z11.s }, p2/Z, [x20, #16]\n" + "ld1rqw { z12.s }, p2/Z, [x19]\n" + "ld1rqw { z13.s }, p2/Z, [x19, #16]\n" + "ld1rw { z25.s }, p2/Z, [%x[clamps]]\n" + "ld1rw { z24.s }, p2/Z, [%x[clamps], #4]\n" + "ld1w { z23.s }, p1/Z, [%x[params]]\n" + "mov z22.d, z23.d\n" + "ld1w { z31.s }, p1/Z, [%x[params], #1, MUL VL]\n" + "mov z21.d, z23.d\n" + "ld1w { z30.s }, p1/Z, [%x[params], #2, MUL VL]\n" + "mov z20.d, z23.d\n" + "ld1w { z29.s }, p1/Z, [%x[params], #3, MUL VL]\n" + "mov z19.d, z23.d\n" + "ld1w { z28.s }, p1/Z, [%x[params], #4, MUL VL]\n" + "mov z18.d, z23.d\n" + "ld1w { z27.s }, p1/Z, [%x[params], #5, MUL VL]\n" + "addvl %x[params], %x[params], #6\n" + "mov z17.d, z23.d\n" + "mov z16.d, z23.d\n" + "1:" // Output channel complete vector loop + "mov z0.d, z8.d\n" + "mov p0.b, p1.b\n" + "mov z1.d, z9.d\n" + "incw x27\n" + "fmla z23.s, z31.s, z2.s[0]\n" + "whilelt p1.s, x27, %x[channel_multiplier]\n" + "fmla z22.s, z31.s, z2.s[1]\n" + "fmla z21.s, z31.s, z2.s[2]\n" + "fmla z20.s, z31.s, z2.s[3]\n" + "fmla z19.s, z31.s, z4.s[0]\n" + "fmla z18.s, z31.s, z4.s[1]\n" + "fmla z17.s, z31.s, z4.s[2]\n" + "fmla z16.s, z31.s, z4.s[3]\n" + "ld1w { z31.s }, p2/Z, [%x[params]]\n" + "fmla z23.s, z30.s, z2.s[1]\n" + "fmla z22.s, z30.s, z2.s[2]\n" + "fmla z21.s, z30.s, z2.s[3]\n" + "fmla z20.s, z30.s, z3.s[0]\n" + "fmla z19.s, z30.s, z4.s[1]\n" + "fmla z18.s, z30.s, z4.s[2]\n" + "fmla z17.s, z30.s, z4.s[3]\n" + "fmla z16.s, z30.s, z5.s[0]\n" + "ld1w { z30.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "fmla z23.s, z29.s, z2.s[2]\n" + "fmla z22.s, z29.s, z2.s[3]\n" + "fmla z21.s, z29.s, z3.s[0]\n" + "fmla z20.s, z29.s, z3.s[1]\n" + "fmla z19.s, z29.s, z4.s[2]\n" + "fmla z18.s, z29.s, z4.s[3]\n" + "fmla z17.s, z29.s, z5.s[0]\n" + "fmla z16.s, z29.s, z5.s[1]\n" + "ld1w { z29.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "fmla z23.s, z28.s, z2.s[3]\n" + "fmla z22.s, z28.s, z3.s[0]\n" + "fmla z21.s, z28.s, z3.s[1]\n" + "fmla z20.s, z28.s, z3.s[2]\n" + "fmla z19.s, z28.s, z4.s[3]\n" + "fmla z18.s, z28.s, z5.s[0]\n" + "fmla z17.s, z28.s, z5.s[1]\n" + "fmla z16.s, z28.s, z5.s[2]\n" + "ld1w { z28.s }, p2/Z, [%x[params], #3, MUL VL]\n" + "fmla z23.s, z27.s, z3.s[0]\n" + "fmla z22.s, z27.s, z3.s[1]\n" + "fmla z21.s, z27.s, z3.s[2]\n" + "fmla z20.s, z27.s,
z3.s[3]\n" + "fmla z19.s, z27.s, z5.s[0]\n" + "fmla z18.s, z27.s, z5.s[1]\n" + "fmla z17.s, z27.s, z5.s[2]\n" + "fmla z16.s, z27.s, z5.s[3]\n" + "ld1w { z27.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "fmla z23.s, z31.s, z4.s[0]\n" + "fmla z22.s, z31.s, z4.s[1]\n" + "fmla z21.s, z31.s, z4.s[2]\n" + "fmla z20.s, z31.s, z4.s[3]\n" + "fmla z19.s, z31.s, z6.s[0]\n" + "fmla z18.s, z31.s, z6.s[1]\n" + "fmla z17.s, z31.s, z6.s[2]\n" + "fmla z16.s, z31.s, z6.s[3]\n" + "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "fmla z23.s, z30.s, z4.s[1]\n" + "fmla z22.s, z30.s, z4.s[2]\n" + "fmla z21.s, z30.s, z4.s[3]\n" + "fmla z20.s, z30.s, z5.s[0]\n" + "fmla z19.s, z30.s, z6.s[1]\n" + "fmla z18.s, z30.s, z6.s[2]\n" + "fmla z17.s, z30.s, z6.s[3]\n" + "fmla z16.s, z30.s, z7.s[0]\n" + "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "fmla z23.s, z29.s, z4.s[2]\n" + "fmla z22.s, z29.s, z4.s[3]\n" + "fmla z21.s, z29.s, z5.s[0]\n" + "fmla z20.s, z29.s, z5.s[1]\n" + "fmla z19.s, z29.s, z6.s[2]\n" + "fmla z18.s, z29.s, z6.s[3]\n" + "fmla z17.s, z29.s, z7.s[0]\n" + "fmla z16.s, z29.s, z7.s[1]\n" + "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "fmla z23.s, z28.s, z4.s[3]\n" + "fmla z22.s, z28.s, z5.s[0]\n" + "fmla z21.s, z28.s, z5.s[1]\n" + "fmla z20.s, z28.s, z5.s[2]\n" + "fmla z19.s, z28.s, z6.s[3]\n" + "fmla z18.s, z28.s, z7.s[0]\n" + "fmla z17.s, z28.s, z7.s[1]\n" + "fmla z16.s, z28.s, z7.s[2]\n" + "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n" + "fmla z23.s, z27.s, z5.s[0]\n" + "fmla z22.s, z27.s, z5.s[1]\n" + "fmla z21.s, z27.s, z5.s[2]\n" + "fmla z20.s, z27.s, z5.s[3]\n" + "fmla z19.s, z27.s, z7.s[0]\n" + "fmla z18.s, z27.s, z7.s[1]\n" + "fmla z17.s, z27.s, z7.s[2]\n" + "fmla z16.s, z27.s, z7.s[3]\n" + "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n" + "fmla z23.s, z31.s, z6.s[0]\n" + "fmla z22.s, z31.s, z6.s[1]\n" + "fmla z21.s, z31.s, z6.s[2]\n" + "fmla z20.s, z31.s, z6.s[3]\n" + "fmla z19.s, z31.s, z0.s[0]\n" + "fmla z18.s, z31.s, z0.s[1]\n" + "fmla z17.s, z31.s, z0.s[2]\n" + "fmla z16.s, z31.s, z0.s[3]\n" + "ld1w { z31.s }, p2/Z, [%x[params], #-6, MUL VL]\n" + "fmla z23.s, z30.s, z6.s[1]\n" + "fmla z22.s, z30.s, z6.s[2]\n" + "fmla z21.s, z30.s, z6.s[3]\n" + "fmla z20.s, z30.s, z7.s[0]\n" + "fmla z19.s, z30.s, z0.s[1]\n" + "fmla z18.s, z30.s, z0.s[2]\n" + "fmla z17.s, z30.s, z0.s[3]\n" + "fmla z16.s, z30.s, z1.s[0]\n" + "ld1w { z30.s }, p2/Z, [%x[params], #-5, MUL VL]\n" + "fmla z23.s, z29.s, z6.s[2]\n" + "fmla z22.s, z29.s, z6.s[3]\n" + "fmla z21.s, z29.s, z7.s[0]\n" + "fmla z20.s, z29.s, z7.s[1]\n" + "fmla z19.s, z29.s, z0.s[2]\n" + "fmla z18.s, z29.s, z0.s[3]\n" + "fmla z17.s, z29.s, z1.s[0]\n" + "fmla z16.s, z29.s, z1.s[1]\n" + "ld1w { z29.s }, p2/Z, [%x[params], #-4, MUL VL]\n" + "fmla z23.s, z28.s, z6.s[3]\n" + "fmla z22.s, z28.s, z7.s[0]\n" + "fmla z21.s, z28.s, z7.s[1]\n" + "fmla z20.s, z28.s, z7.s[2]\n" + "fmla z19.s, z28.s, z0.s[3]\n" + "fmla z18.s, z28.s, z1.s[0]\n" + "fmla z17.s, z28.s, z1.s[1]\n" + "fmla z16.s, z28.s, z1.s[2]\n" + "ld1w { z28.s }, p2/Z, [%x[params], #-3, MUL VL]\n" + "fmla z23.s, z27.s, z7.s[0]\n" + "fmla z22.s, z27.s, z7.s[1]\n" + "fmla z21.s, z27.s, z7.s[2]\n" + "fmla z20.s, z27.s, z7.s[3]\n" + "fmla z19.s, z27.s, z1.s[0]\n" + "fmla z18.s, z27.s, z1.s[1]\n" + "fmla z17.s, z27.s, z1.s[2]\n" + "fmla z16.s, z27.s, z1.s[3]\n" + "ld1w { z27.s }, p2/Z, [%x[params], #-2, MUL VL]\n" + "fmla z23.s, z31.s, z0.s[0]\n" + "fmla z22.s, z31.s, z0.s[1]\n" + "fmla z21.s, z31.s, z0.s[2]\n" + "fmla z20.s, z31.s, z0.s[3]\n" + "mov 
z0.d, z10.d\n" + "fmla z19.s, z31.s, z0.s[0]\n" + "fmla z18.s, z31.s, z0.s[1]\n" + "fmla z17.s, z31.s, z0.s[2]\n" + "fmla z16.s, z31.s, z0.s[3]\n" + "ld1w { z31.s }, p2/Z, [%x[params], #-1, MUL VL]\n" + "mov z0.d, z8.d\n" + "fmla z23.s, z30.s, z0.s[1]\n" + "fmla z22.s, z30.s, z0.s[2]\n" + "fmla z21.s, z30.s, z0.s[3]\n" + "mov z0.d, z10.d\n" + "fmla z20.s, z30.s, z1.s[0]\n" + "mov z1.d, z11.d\n" + "fmla z19.s, z30.s, z0.s[1]\n" + "fmla z18.s, z30.s, z0.s[2]\n" + "fmla z17.s, z30.s, z0.s[3]\n" + "mov z0.d, z8.d\n" + "fmla z16.s, z30.s, z1.s[0]\n" + "ld1w { z30.s }, p2/Z, [%x[params]]\n" + "mov z1.d, z9.d\n" + "fmla z23.s, z29.s, z0.s[2]\n" + "fmla z22.s, z29.s, z0.s[3]\n" + "mov z0.d, z10.d\n" + "fmla z21.s, z29.s, z1.s[0]\n" + "fmla z20.s, z29.s, z1.s[1]\n" + "mov z1.d, z11.d\n" + "fmla z19.s, z29.s, z0.s[2]\n" + "fmla z18.s, z29.s, z0.s[3]\n" + "mov z0.d, z8.d\n" + "fmla z17.s, z29.s, z1.s[0]\n" + "fmla z16.s, z29.s, z1.s[1]\n" + "ld1w { z29.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "mov z1.d, z9.d\n" + "fmla z23.s, z28.s, z0.s[3]\n" + "mov z0.d, z10.d\n" + "fmla z22.s, z28.s, z1.s[0]\n" + "fmla z21.s, z28.s, z1.s[1]\n" + "fmla z20.s, z28.s, z1.s[2]\n" + "mov z1.d, z11.d\n" + "fmla z19.s, z28.s, z0.s[3]\n" + "fmla z18.s, z28.s, z1.s[0]\n" + "fmla z17.s, z28.s, z1.s[1]\n" + "fmla z16.s, z28.s, z1.s[2]\n" + "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "mov z1.d, z9.d\n" + "fmla z23.s, z27.s, z1.s[0]\n" + "fmla z22.s, z27.s, z1.s[1]\n" + "fmla z21.s, z27.s, z1.s[2]\n" + "fmla z20.s, z27.s, z1.s[3]\n" + "mov z1.d, z11.d\n" + "fmla z19.s, z27.s, z1.s[0]\n" + "fmla z18.s, z27.s, z1.s[1]\n" + "fmla z17.s, z27.s, z1.s[2]\n" + "fmla z16.s, z27.s, z1.s[3]\n" + "ld1w { z27.s }, p2/Z, [%x[params], #3, MUL VL]\n" + "fmla z23.s, z31.s, z0.s[0]\n" + "fmla z22.s, z31.s, z0.s[1]\n" + "fmla z21.s, z31.s, z0.s[2]\n" + "fmla z20.s, z31.s, z0.s[3]\n" + "mov z0.d, z12.d\n" + "fmla z19.s, z31.s, z0.s[0]\n" + "fmla z18.s, z31.s, z0.s[1]\n" + "fmla z17.s, z31.s, z0.s[2]\n" + "fmla z16.s, z31.s, z0.s[3]\n" + "ld1w { z31.s }, p1/Z, [%x[params], #5, MUL VL]\n" + "mov z0.d, z10.d\n" + "fmla z23.s, z30.s, z0.s[1]\n" + "fmla z22.s, z30.s, z0.s[2]\n" + "fmla z21.s, z30.s, z0.s[3]\n" + "mov z0.d, z12.d\n" + "fmla z20.s, z30.s, z1.s[0]\n" + "mov z1.d, z13.d\n" + "fmla z19.s, z30.s, z0.s[1]\n" + "fmla z18.s, z30.s, z0.s[2]\n" + "fmla z17.s, z30.s, z0.s[3]\n" + "mov z0.d, z10.d\n" + "fmla z16.s, z30.s, z1.s[0]\n" + "ld1w { z30.s }, p1/Z, [%x[params], #6, MUL VL]\n" + "mov z1.d, z11.d\n" + "fmla z23.s, z29.s, z0.s[2]\n" + "fmla z22.s, z29.s, z0.s[3]\n" + "mov z0.d, z12.d\n" + "fmla z21.s, z29.s, z1.s[0]\n" + "fmla z20.s, z29.s, z1.s[1]\n" + "mov z1.d, z13.d\n" + "fmla z19.s, z29.s, z0.s[2]\n" + "fmla z18.s, z29.s, z0.s[3]\n" + "mov z0.d, z10.d\n" + "fmla z17.s, z29.s, z1.s[0]\n" + "fmla z16.s, z29.s, z1.s[1]\n" + "ld1w { z29.s }, p1/Z, [%x[params], #7, MUL VL]\n" + "mov z1.d, z11.d\n" + "fmla z23.s, z28.s, z0.s[3]\n" + "mov z0.d, z12.d\n" + "fmla z22.s, z28.s, z1.s[0]\n" + "fmla z21.s, z28.s, z1.s[1]\n" + "fmla z20.s, z28.s, z1.s[2]\n" + "mov z1.d, z13.d\n" + "fmla z19.s, z28.s, z0.s[3]\n" + "fmla z18.s, z28.s, z1.s[0]\n" + "fmla z17.s, z28.s, z1.s[1]\n" + "fmla z16.s, z28.s, z1.s[2]\n" + "mov z1.d, z11.d\n" + "fmla z23.s, z27.s, z1.s[0]\n" + "fmla z22.s, z27.s, z1.s[1]\n" + "fmla z21.s, z27.s, z1.s[2]\n" + "fmla z20.s, z27.s, z1.s[3]\n" + "mov z1.d, z13.d\n" + "fmla z19.s, z27.s, z1.s[0]\n" + "fmla z18.s, z27.s, z1.s[1]\n" + "fmla z17.s, z27.s, z1.s[2]\n" + "fmla z16.s, z27.s, z1.s[3]\n" + "fmin z23.s, p2/M, z23.s, 
z24.s\n" + "fmin z22.s, p2/M, z22.s, z24.s\n" + "fmin z21.s, p2/M, z21.s, z24.s\n" + "fmin z20.s, p2/M, z20.s, z24.s\n" + "fmax z23.s, p2/M, z23.s, z25.s\n" + "st1w { z23.s }, p0, [x11, x24, LSL #2]\n" + "fmax z22.s, p2/M, z22.s, z25.s\n" + "fmax z21.s, p2/M, z21.s, z25.s\n" + "ld1w { z23.s }, p1/Z, [%x[params], #4, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "fmax z20.s, p2/M, z20.s, z25.s\n" + "ld1w { z28.s }, p1/Z, [%x[params], #-8, MUL VL]\n" + "fmin z19.s, p2/M, z19.s, z24.s\n" + "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n" + "addvl %x[params], %x[params], #-6\n" + "fmin z18.s, p2/M, z18.s, z24.s\n" + "st1w { z22.s }, p0, [x10, x24, LSL #2]\n" + "mov z22.d, z23.d\n" + "st1w { z21.s }, p0, [x9, x24, LSL #2]\n" + "mov z21.d, z23.d\n" + "st1w { z20.s }, p0, [x28, x24, LSL #2]\n" + "mov z20.d, z23.d\n" + "fmax z19.s, p2/M, z19.s, z25.s\n" + "st1w { z19.s }, p0, [x26, x24, LSL #2]\n" + "mov z19.d, z23.d\n" + "fmax z18.s, p2/M, z18.s, z25.s\n" + "st1w { z18.s }, p0, [x25, x24, LSL #2]\n" + "mov z18.d, z23.d\n" + "fmin z17.s, p2/M, z17.s, z24.s\n" + "fmin z16.s, p2/M, z16.s, z24.s\n" + "fmax z17.s, p2/M, z17.s, z25.s\n" + "st1w { z17.s }, p0, [x23, x24, LSL #2]\n" + "mov z17.d, z23.d\n" + "fmax z16.s, p2/M, z16.s, z25.s\n" + "st1w { z16.s }, p0, [x22, x24, LSL #2]\n" + "mov z16.d, z23.d\n" + "incw x24\n" + "b.any 1b\n" + : [params] "+&r" (params) + : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp new file mode 100644 index 0000000000..7a4bd1dd1e --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float); + +struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +{ + typedef float bias_type; + typedef float input_type; + typedef float weight_type; + typedef float return_type; + + typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int output_rows(void) { return 2; }; + constexpr static unsigned int output_cols(void) { return 8; }; + + constexpr static unsigned int output_col_regs(void) { return 2; }; + + kern_type kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + + sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..0124370067 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl( + const float *const *const inptrs, + float *const *const outptrs, + const float *weights, + const float *bias, + const unsigned int kernel_points, + const unsigned int n_output_channels, + const float activation_min, + const float activation_max +) +{ + const float minmax_vals[2] = { activation_min, activation_max }; + + __asm__ __volatile__( + "ptrue p1.b\n" + "ld1rw { z11.s }, p1/Z, [%x[minmax_vals]]\n" + "mov x28, #0x0\n" + "ld1rw { z10.s }, p1/Z, [%x[minmax_vals], #4]\n" + "whilelt p0.s, x28, %x[n_output_channels]\n" + "1:" // Output channel loop + "mov z16.b, #0x0\n" + "cbz %x[bias], 2f\n" + "ld1w { z16.s }, p0/Z, [%x[bias], x28, LSL #2]\n" + "2:" // Output channel loop: Load bias: Done + "mov z9.d, z16.d\n" + "ld1w { z8.s }, p1/Z, [%x[weights]]\n" + "mov x20, %x[inptrs]\n" + "mov z31.d, z16.d\n" + "ldp x24, x27, [x20], #0x10\n" + "lsr x19, %x[kernel_points], #0x1\n" + "mov z30.d, z16.d\n" + "ld1rqw { z7.s }, p1/Z, [x24]\n" + "mov z29.d, z16.d\n" + "addvl %x[weights], %x[weights], #1\n" + "mov z28.d, z16.d\n" + "ld1rqw { z6.s }, p1/Z, [x24, #16]\n" + "mov z27.d, z16.d\n" + "ld1rqw { z5.s }, p1/Z, [x27]\n" + "mov z26.d, z16.d\n" + "ld1rqw { z4.s }, p1/Z, [x27, #16]\n" + "mov z25.d, z16.d\n" + "mov z24.d, z16.d\n" + "mov z23.d, z16.d\n" + "mov z22.d, z16.d\n" + "mov z21.d, z16.d\n" + "mov z20.d, z16.d\n" + "mov z19.d, z16.d\n" + "mov z18.d, z16.d\n" + "mov z17.d, z16.d\n" + "cbz x19, 6f\n" + "ldp x24, x27, [x20], #0x10\n" + "ld1w { z16.s }, p1/Z, [%x[weights]]\n" + "subs x19, x19, #0x1\n" + "addvl %x[weights], %x[weights], #1\n" + "ld1rqw { z3.s }, p1/Z, [x24]\n" + "ld1rqw { z2.s }, p1/Z, [x24, #16]\n" + "ld1rqw { z1.s }, p1/Z, [x27]\n" + "ld1rqw { z0.s }, p1/Z, [x27, #16]\n" + "beq 4f\n" + "3:" // Output channel loop: Kernel loop + "fmla z9.s, z8.s, z7.s[0]\n" + "ldp x24, x27, [x20], #0x10\n" + "subs x19, x19, #0x1\n" + "fmla z31.s, z8.s, z7.s[1]\n" + "fmla z30.s, z8.s, z7.s[2]\n" + "fmla z29.s, z8.s, z7.s[3]\n" + "ld1rqw { z7.s }, p1/Z, [x24]\n" + "fmla z28.s, z8.s, z6.s[0]\n" + "fmla z27.s, z8.s, z6.s[1]\n" + "fmla z26.s, z8.s, z6.s[2]\n" + "fmla z25.s, z8.s, z6.s[3]\n" + "ld1rqw { z6.s }, p1/Z, [x24, #16]\n" + "fmla z24.s, z8.s, z5.s[0]\n" + "fmla z23.s, z8.s, z5.s[1]\n" + "fmla z22.s, z8.s, z5.s[2]\n" + "fmla z21.s, z8.s, z5.s[3]\n" + "ld1rqw { z5.s }, p1/Z, [x27]\n" + "fmla z20.s, z8.s, z4.s[0]\n" + "fmla z19.s, z8.s, z4.s[1]\n" + "fmla z18.s, z8.s, z4.s[2]\n" + "fmla z17.s, z8.s, z4.s[3]\n" + "ld1rqw { z4.s }, p1/Z, [x27, #16]\n" + "fmla z9.s, z16.s, z3.s[0]\n" + "ld1w { z8.s }, p1/Z, [%x[weights]]\n" + "fmla z31.s, z16.s, z3.s[1]\n" + "ldp x24, x27, [x20], #0x10\n" + "fmla z30.s, z16.s, z3.s[2]\n" + "fmla z29.s, z16.s, z3.s[3]\n" + "ld1rqw { z3.s }, p1/Z, [x24]\n" + "fmla z28.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z2.s[1]\n" + "fmla z26.s, z16.s, z2.s[2]\n" + "fmla z25.s, z16.s, z2.s[3]\n" + "ld1rqw { z2.s }, p1/Z, [x24, #16]\n" + "fmla z24.s, z16.s, z1.s[0]\n" + "fmla z23.s, z16.s, z1.s[1]\n" + "fmla z22.s, z16.s, z1.s[2]\n" + "fmla z21.s, z16.s, z1.s[3]\n" + "ld1rqw { z1.s }, p1/Z, [x27]\n" + "fmla z20.s, z16.s, z0.s[0]\n" + "fmla z19.s, z16.s, z0.s[1]\n" + "fmla z18.s, z16.s, z0.s[2]\n" + "fmla z17.s, z16.s, z0.s[3]\n" + "ld1rqw { z0.s }, p1/Z, [x27, #16]\n" + "ld1w { z16.s }, p1/Z, [%x[weights], #1, MUL VL]\n" + "addvl %x[weights], %x[weights], #2\n" + "bgt 3b\n" + "4:" //
Output channel loop: Kernel loop tail + "tbnz %x[kernel_points], #0, 5f\n" + "fmla z9.s, z8.s, z7.s[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "fmla z31.s, z8.s, z7.s[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "fmla z30.s, z8.s, z7.s[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla z29.s, z8.s, z7.s[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla z28.s, z8.s, z6.s[0]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla z27.s, z8.s, z6.s[1]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla z26.s, z8.s, z6.s[2]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla z25.s, z8.s, z6.s[3]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla z24.s, z8.s, z5.s[0]\n" + "fmla z23.s, z8.s, z5.s[1]\n" + "fmla z22.s, z8.s, z5.s[2]\n" + "fmla z21.s, z8.s, z5.s[3]\n" + "fmla z20.s, z8.s, z4.s[0]\n" + "fmla z19.s, z8.s, z4.s[1]\n" + "fmla z18.s, z8.s, z4.s[2]\n" + "fmla z17.s, z8.s, z4.s[3]\n" + "fmla z9.s, z16.s, z3.s[0]\n" + "fmla z31.s, z16.s, z3.s[1]\n" + "fmla z30.s, z16.s, z3.s[2]\n" + "fmla z29.s, z16.s, z3.s[3]\n" + "fmla z28.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z2.s[1]\n" + "fmla z26.s, z16.s, z2.s[2]\n" + "fmla z25.s, z16.s, z2.s[3]\n" + "fmla z24.s, z16.s, z1.s[0]\n" + "fmla z23.s, z16.s, z1.s[1]\n" + "fmla z22.s, z16.s, z1.s[2]\n" + "fmla z21.s, z16.s, z1.s[3]\n" + "fmla z20.s, z16.s, z0.s[0]\n" + "fmla z19.s, z16.s, z0.s[1]\n" + "fmla z18.s, z16.s, z0.s[2]\n" + "fmla z17.s, z16.s, z0.s[3]\n" + "fmin z9.s, p1/M, z9.s, z10.s\n" + "fmin z31.s, p1/M, z31.s, z10.s\n" + "fmin z30.s, p1/M, z30.s, z10.s\n" + "fmin z29.s, p1/M, z29.s, z10.s\n" + "fmax z9.s, p1/M, z9.s, z11.s\n" + "st1w { z9.s }, p0, [x19, x28, LSL #2]\n" + "fmax z31.s, p1/M, z31.s, z11.s\n" + "fmax z30.s, p1/M, z30.s, z11.s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmax z29.s, p1/M, z29.s, z11.s\n" + "st1w { z31.s }, p0, [x20, x28, LSL #2]\n" + "fmin z28.s, p1/M, z28.s, z10.s\n" + "fmin z27.s, p1/M, z27.s, z10.s\n" + "st1w { z30.s }, p0, [x21, x28, LSL #2]\n" + "fmin z26.s, p1/M, z26.s, z10.s\n" + "st1w { z29.s }, p0, [x22, x28, LSL #2]\n" + "fmin z25.s, p1/M, z25.s, z10.s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin z24.s, p1/M, z24.s, z10.s\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax z28.s, p1/M, z28.s, z11.s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax z27.s, p1/M, z27.s, z11.s\n" + "st1w { z28.s }, p0, [x23, x28, LSL #2]\n" + "fmax z26.s, p1/M, z26.s, z11.s\n" + "fmax z25.s, p1/M, z25.s, z11.s\n" + "st1w { z27.s }, p0, [x24, x28, LSL #2]\n" + "fmax z24.s, p1/M, z24.s, z11.s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmin z23.s, p1/M, z23.s, z10.s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmin z22.s, p1/M, z22.s, z10.s\n" + "st1w { z26.s }, p0, [x25, x28, LSL #2]\n" + "fmin z21.s, p1/M, z21.s, z10.s\n" + "st1w { z25.s }, p0, [x26, x28, LSL #2]\n" + "fmin z20.s, p1/M, z20.s, z10.s\n" + "st1w { z24.s }, p0, [x19, x28, LSL #2]\n" + "fmax z23.s, p1/M, z23.s, z11.s\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax z22.s, p1/M, z22.s, z11.s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax z21.s, p1/M, z21.s, z11.s\n" + "st1w { z23.s }, p0, [x20, x28, LSL #2]\n" + "fmax z20.s, p1/M, z20.s, z11.s\n" + "fmin z19.s, p1/M, z19.s, z10.s\n" + "st1w { z22.s }, p0, [x21, x28, LSL #2]\n" + "fmin z18.s, p1/M, z18.s, z10.s\n" + "st1w { z21.s }, p0, [x22, x28, LSL #2]\n" + "fmin z17.s, p1/M, z17.s, z10.s\n" + "st1w { z20.s }, p0, [x23, x28, LSL #2]\n" + "fmax z19.s, p1/M, z19.s, z11.s\n" + "fmax z18.s, p1/M, z18.s, z11.s\n" + "st1w { z19.s }, p0, [x24, x28, LSL #2]\n" + "fmax z17.s, p1/M, z17.s, z11.s\n" + "st1w { z18.s }, p0, [x25, x28, LSL #2]\n" + "st1w { 
z17.s }, p0, [x26, x28, LSL #2]\n" + "b 7f\n" + "5:" // Output channel loop: Odd tail + "fmla z9.s, z8.s, z7.s[0]\n" + "ldp x24, x27, [x20], #0x10\n" + "fmla z31.s, z8.s, z7.s[1]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "fmla z30.s, z8.s, z7.s[2]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "fmla z29.s, z8.s, z7.s[3]\n" + "ld1rqw { z7.s }, p1/Z, [x24]\n" + "fmla z28.s, z8.s, z6.s[0]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla z27.s, z8.s, z6.s[1]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla z26.s, z8.s, z6.s[2]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla z25.s, z8.s, z6.s[3]\n" + "ld1rqw { z6.s }, p1/Z, [x24, #16]\n" + "fmla z24.s, z8.s, z5.s[0]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla z23.s, z8.s, z5.s[1]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla z22.s, z8.s, z5.s[2]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla z21.s, z8.s, z5.s[3]\n" + "ld1rqw { z5.s }, p1/Z, [x27]\n" + "fmla z20.s, z8.s, z4.s[0]\n" + "fmla z19.s, z8.s, z4.s[1]\n" + "fmla z18.s, z8.s, z4.s[2]\n" + "fmla z17.s, z8.s, z4.s[3]\n" + "ld1rqw { z4.s }, p1/Z, [x27, #16]\n" + "fmla z9.s, z16.s, z3.s[0]\n" + "ld1w { z8.s }, p1/Z, [%x[weights]]\n" + "addvl %x[weights], %x[weights], #1\n" + "fmla z31.s, z16.s, z3.s[1]\n" + "fmla z30.s, z16.s, z3.s[2]\n" + "fmla z29.s, z16.s, z3.s[3]\n" + "fmla z28.s, z16.s, z2.s[0]\n" + "fmla z27.s, z16.s, z2.s[1]\n" + "fmla z26.s, z16.s, z2.s[2]\n" + "fmla z25.s, z16.s, z2.s[3]\n" + "fmla z24.s, z16.s, z1.s[0]\n" + "fmla z23.s, z16.s, z1.s[1]\n" + "fmla z22.s, z16.s, z1.s[2]\n" + "fmla z21.s, z16.s, z1.s[3]\n" + "fmla z20.s, z16.s, z0.s[0]\n" + "fmla z19.s, z16.s, z0.s[1]\n" + "fmla z18.s, z16.s, z0.s[2]\n" + "fmla z17.s, z16.s, z0.s[3]\n" + "fmla z9.s, z8.s, z7.s[0]\n" + "fmla z31.s, z8.s, z7.s[1]\n" + "fmla z30.s, z8.s, z7.s[2]\n" + "fmla z29.s, z8.s, z7.s[3]\n" + "fmla z28.s, z8.s, z6.s[0]\n" + "fmla z27.s, z8.s, z6.s[1]\n" + "fmla z26.s, z8.s, z6.s[2]\n" + "fmla z25.s, z8.s, z6.s[3]\n" + "fmla z24.s, z8.s, z5.s[0]\n" + "fmla z23.s, z8.s, z5.s[1]\n" + "fmla z22.s, z8.s, z5.s[2]\n" + "fmla z21.s, z8.s, z5.s[3]\n" + "fmla z20.s, z8.s, z4.s[0]\n" + "fmla z19.s, z8.s, z4.s[1]\n" + "fmla z18.s, z8.s, z4.s[2]\n" + "fmla z17.s, z8.s, z4.s[3]\n" + "fmin z9.s, p1/M, z9.s, z10.s\n" + "fmin z31.s, p1/M, z31.s, z10.s\n" + "fmin z30.s, p1/M, z30.s, z10.s\n" + "fmin z29.s, p1/M, z29.s, z10.s\n" + "fmax z9.s, p1/M, z9.s, z11.s\n" + "st1w { z9.s }, p0, [x19, x28, LSL #2]\n" + "fmax z31.s, p1/M, z31.s, z11.s\n" + "fmax z30.s, p1/M, z30.s, z11.s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmax z29.s, p1/M, z29.s, z11.s\n" + "st1w { z31.s }, p0, [x20, x28, LSL #2]\n" + "fmin z28.s, p1/M, z28.s, z10.s\n" + "fmin z27.s, p1/M, z27.s, z10.s\n" + "st1w { z30.s }, p0, [x21, x28, LSL #2]\n" + "fmin z26.s, p1/M, z26.s, z10.s\n" + "st1w { z29.s }, p0, [x22, x28, LSL #2]\n" + "fmin z25.s, p1/M, z25.s, z10.s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin z24.s, p1/M, z24.s, z10.s\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax z28.s, p1/M, z28.s, z11.s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax z27.s, p1/M, z27.s, z11.s\n" + "st1w { z28.s }, p0, [x23, x28, LSL #2]\n" + "fmax z26.s, p1/M, z26.s, z11.s\n" + "fmax z25.s, p1/M, z25.s, z11.s\n" + "st1w { z27.s }, p0, [x24, x28, LSL #2]\n" + "fmax z24.s, p1/M, z24.s, z11.s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmin z23.s, p1/M, z23.s, z10.s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmin z22.s, p1/M, z22.s, z10.s\n" + "st1w { z26.s }, p0, [x25, x28, LSL #2]\n" + "fmin z21.s, p1/M, z21.s, z10.s\n" + "st1w { z25.s }, p0, [x26, x28, LSL #2]\n" + "fmin z20.s, p1/M, 
z20.s, z10.s\n" + "st1w { z24.s }, p0, [x19, x28, LSL #2]\n" + "fmax z23.s, p1/M, z23.s, z11.s\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax z22.s, p1/M, z22.s, z11.s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax z21.s, p1/M, z21.s, z11.s\n" + "st1w { z23.s }, p0, [x20, x28, LSL #2]\n" + "fmax z20.s, p1/M, z20.s, z11.s\n" + "fmin z19.s, p1/M, z19.s, z10.s\n" + "st1w { z22.s }, p0, [x21, x28, LSL #2]\n" + "fmin z18.s, p1/M, z18.s, z10.s\n" + "st1w { z21.s }, p0, [x22, x28, LSL #2]\n" + "fmin z17.s, p1/M, z17.s, z10.s\n" + "st1w { z20.s }, p0, [x23, x28, LSL #2]\n" + "fmax z19.s, p1/M, z19.s, z11.s\n" + "fmax z18.s, p1/M, z18.s, z11.s\n" + "st1w { z19.s }, p0, [x24, x28, LSL #2]\n" + "fmax z17.s, p1/M, z17.s, z11.s\n" + "st1w { z18.s }, p0, [x25, x28, LSL #2]\n" + "st1w { z17.s }, p0, [x26, x28, LSL #2]\n" + "b 7f\n" + "6:" // Output channel loop: Single kernel point + "fmla z9.s, z8.s, z7.s[0]\n" + "ldr x19, [%x[outptrs], #0x0]\n" + "fmla z31.s, z8.s, z7.s[1]\n" + "ldr x20, [%x[outptrs], #0x8]\n" + "fmla z30.s, z8.s, z7.s[2]\n" + "ldr x21, [%x[outptrs], #0x10]\n" + "fmla z29.s, z8.s, z7.s[3]\n" + "ldr x22, [%x[outptrs], #0x18]\n" + "fmla z28.s, z8.s, z6.s[0]\n" + "ldr x23, [%x[outptrs], #0x20]\n" + "fmla z27.s, z8.s, z6.s[1]\n" + "ldr x24, [%x[outptrs], #0x28]\n" + "fmla z26.s, z8.s, z6.s[2]\n" + "ldr x25, [%x[outptrs], #0x30]\n" + "fmla z25.s, z8.s, z6.s[3]\n" + "ldr x26, [%x[outptrs], #0x38]\n" + "fmla z24.s, z8.s, z5.s[0]\n" + "fmla z23.s, z8.s, z5.s[1]\n" + "fmla z22.s, z8.s, z5.s[2]\n" + "fmla z21.s, z8.s, z5.s[3]\n" + "fmla z20.s, z8.s, z4.s[0]\n" + "fmla z19.s, z8.s, z4.s[1]\n" + "fmla z18.s, z8.s, z4.s[2]\n" + "fmla z17.s, z8.s, z4.s[3]\n" + "fmin z9.s, p1/M, z9.s, z10.s\n" + "fmin z31.s, p1/M, z31.s, z10.s\n" + "fmin z30.s, p1/M, z30.s, z10.s\n" + "fmin z29.s, p1/M, z29.s, z10.s\n" + "fmax z9.s, p1/M, z9.s, z11.s\n" + "st1w { z9.s }, p0, [x19, x28, LSL #2]\n" + "fmax z31.s, p1/M, z31.s, z11.s\n" + "fmax z30.s, p1/M, z30.s, z11.s\n" + "ldr x19, [%x[outptrs], #0x40]\n" + "fmax z29.s, p1/M, z29.s, z11.s\n" + "st1w { z31.s }, p0, [x20, x28, LSL #2]\n" + "fmin z28.s, p1/M, z28.s, z10.s\n" + "fmin z27.s, p1/M, z27.s, z10.s\n" + "st1w { z30.s }, p0, [x21, x28, LSL #2]\n" + "fmin z26.s, p1/M, z26.s, z10.s\n" + "st1w { z29.s }, p0, [x22, x28, LSL #2]\n" + "fmin z25.s, p1/M, z25.s, z10.s\n" + "ldr x20, [%x[outptrs], #0x48]\n" + "fmin z24.s, p1/M, z24.s, z10.s\n" + "ldr x21, [%x[outptrs], #0x50]\n" + "fmax z28.s, p1/M, z28.s, z11.s\n" + "ldr x22, [%x[outptrs], #0x58]\n" + "fmax z27.s, p1/M, z27.s, z11.s\n" + "st1w { z28.s }, p0, [x23, x28, LSL #2]\n" + "fmax z26.s, p1/M, z26.s, z11.s\n" + "fmax z25.s, p1/M, z25.s, z11.s\n" + "st1w { z27.s }, p0, [x24, x28, LSL #2]\n" + "fmax z24.s, p1/M, z24.s, z11.s\n" + "ldr x23, [%x[outptrs], #0x60]\n" + "fmin z23.s, p1/M, z23.s, z10.s\n" + "ldr x24, [%x[outptrs], #0x68]\n" + "fmin z22.s, p1/M, z22.s, z10.s\n" + "st1w { z26.s }, p0, [x25, x28, LSL #2]\n" + "fmin z21.s, p1/M, z21.s, z10.s\n" + "st1w { z25.s }, p0, [x26, x28, LSL #2]\n" + "fmin z20.s, p1/M, z20.s, z10.s\n" + "st1w { z24.s }, p0, [x19, x28, LSL #2]\n" + "fmax z23.s, p1/M, z23.s, z11.s\n" + "ldr x25, [%x[outptrs], #0x70]\n" + "fmax z22.s, p1/M, z22.s, z11.s\n" + "ldr x26, [%x[outptrs], #0x78]\n" + "fmax z21.s, p1/M, z21.s, z11.s\n" + "st1w { z23.s }, p0, [x20, x28, LSL #2]\n" + "fmax z20.s, p1/M, z20.s, z11.s\n" + "fmin z19.s, p1/M, z19.s, z10.s\n" + "st1w { z22.s }, p0, [x21, x28, LSL #2]\n" + "fmin z18.s, p1/M, z18.s, z10.s\n" + "st1w { z21.s }, p0, [x22, x28, LSL #2]\n" + "fmin z17.s, 
p1/M, z17.s, z10.s\n" + "st1w { z20.s }, p0, [x23, x28, LSL #2]\n" + "fmax z19.s, p1/M, z19.s, z11.s\n" + "fmax z18.s, p1/M, z18.s, z11.s\n" + "st1w { z19.s }, p0, [x24, x28, LSL #2]\n" + "fmax z17.s, p1/M, z17.s, z11.s\n" + "st1w { z18.s }, p0, [x25, x28, LSL #2]\n" + "st1w { z17.s }, p0, [x26, x28, LSL #2]\n" + "7:" // Output channel loop: Done + "incw x28\n" + "whilelt p0.s, x28, %x[n_output_channels]\n" + "b.any 1b\n" + : [weights] "+&r" (weights) + : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs) + : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..295e1f6450 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + +struct sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size; + + kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + + sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..90f924a8ed --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
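The tile constants in the descriptor above are linked: the input patch a tile needs is (output - 1) * stride + kernel in each dimension, which is where input_rows = input_cols = 4 comes from for this 3x3, stride-1, 2x2-output kernel. A small compile-time sketch of that relation (the helper name is illustrative):

constexpr unsigned int input_extent(unsigned int out, unsigned int stride, unsigned int kernel)
{
    return (out - 1) * stride + kernel;  // rows and cols are computed independently
}
static_assert(input_extent(2, 1, 3) == 4, "3x3/s1 with a 2x2 output tile reads a 4x4 patch");
static_assert(input_extent(2, 2, 3) == 5, "3x3/s2 with a 2x2 output tile reads a 5x5 patch");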
+ */ + +#if defined(__ARM_FEATURE_SVE) + +#include "arm_gemm.hpp" +#include + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +{ + __asm__ __volatile__( + "ldp x11, x10, [%x[inptrs], #0x0]\n" + "ptrue p2.b\n" + "ldp x9, x28, [%x[inptrs], #0x10]\n" + "addvl SP, SP, #-8\n" + "ldp x27, x26, [%x[inptrs], #0x20]\n" + "mov x19, #0x1\n" + "ldp x25, x24, [%x[inptrs], #0x30]\n" + "orr x19, x19, #0x100\n" + "ldp x23, x22, [%x[outptrs], #0x0]\n" + "orr x19, x19, #0x10000\n" + "dup z12.s, w19\n" + "ldp x21, x20, [%x[outptrs], #0x10]\n" + "mov x19, #0x0\n" + "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n" + "whilelt p1.b, x19, %x[n_channels]\n" + "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n" + "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n" + "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n" + "1:" // Loop + "mov z7.s, #0x0\n" + "ld1b { z19.b }, p1/Z, [x11, x19]\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "mov z6.s, #0x0\n" + "ld1b { z18.b }, p1/Z, [x10, x19]\n" + "ldp x11, x10, [%x[inptrs], #0x40]\n" + "ld1b { z16.b }, p1/Z, [x9, x19]\n" + "zip1 z21.b, z19.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x28, x19]\n" + "zip2 z19.b, z19.b, z16.b\n" + "ldp x9, x28, [%x[inptrs], #0x50]\n" + "ld1b { z23.b }, p1/Z, [x27, x19]\n" + "zip1 z16.b, z18.b, z17.b\n" + "ld1b { z20.b }, p1/Z, [x26, x19]\n" + "zip2 z18.b, z18.b, z17.b\n" + "ldp x27, x26, [%x[inptrs], #0x60]\n" + "zip1 z5.b, z21.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x25, x19]\n" + "zip2 z4.b, z21.b, z16.b\n" + "ld1b { z16.b }, p1/Z, [x24, x19]\n" + "zip1 z29.b, z19.b, z18.b\n" + "ldp x25, x24, [%x[inptrs], #0x70]\n" + "zip2 z28.b, z19.b, z18.b\n" + "ld1b { z22.b }, p1/Z, [x11, x19]\n" + "zip1 z19.b, z23.b, z17.b\n" + "ld1b { z21.b }, p1/Z, [x10, x19]\n" + "zip2 z27.b, z23.b, z17.b\n" + "ldp x11, x10, [%x[inptrs], #0x0]\n" + "zip1 z18.b, z20.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x9, x19]\n" + "zip2 z20.b, z20.b, z16.b\n" + "ld1b { z16.b }, p1/Z, [x28, x19]\n" + "zip1 z3.b, z19.b, z18.b\n" + "ldp x9, x28, [%x[inptrs], #0x10]\n" + "zip2 z2.b, z19.b, z18.b\n" + "ld1b { z19.b }, p1/Z, [x27, x19]\n" + "zip1 z26.b, z22.b, z17.b\n" + "ld1b { z25.b }, p1/Z, [x26, x19]\n" + "zip2 z24.b, z22.b, z17.b\n" + "ldp x27, x26, [%x[inptrs], #0x20]\n" + "zip1 z23.b, z21.b, z16.b\n" + "ld1b { z18.b }, p1/Z, [x25, x19]\n" + "zip2 z22.b, z21.b, z16.b\n" + "ld1b { z21.b }, p1/Z, [x24, x19]\n" + "zip1 z17.b, z27.b, z20.b\n" + "ldp x25, x24, [%x[inptrs], #0x30]\n" + "zip2 z16.b, z27.b, z20.b\n" + "st1b { z29.b }, p2, [SP]\n" + "zip1 z20.b, z19.b, z18.b\n" + "st1b { z28.b }, p2, [SP, #1, MUL VL]\n" + "zip2 z19.b, z19.b, z18.b\n" + "st1b { z17.b }, p2, [SP, #2, MUL VL]\n" + "zip1 z18.b, z25.b, z21.b\n" + "st1b { z16.b }, p2, [SP, #3, MUL VL]\n" + "zip2 z17.b, z25.b, z21.b\n" + "ld1w { z1.s }, p2/Z, [%x[params]]\n" + "zip1 z0.b, z26.b, z23.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n" + "zip2 z30.b, z26.b, z23.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n" + "zip1 z16.b, z24.b, z22.b\n" + "st1b { z16.b }, p2, [SP, #4, MUL VL]\n" + "zip2 z16.b, z24.b, z22.b\n" + "st1b { z16.b }, p2, [SP, #5, MUL VL]\n" + "zip1 z28.b, z20.b, z18.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "zip2 z26.b, z20.b, z18.b\n" + "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "zip1 z16.b, 
z19.b, z17.b\n" + "st1b { z16.b }, p2, [SP, #6, MUL VL]\n" + "zip2 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p2, [SP, #7, MUL VL]\n" + "mov z24.d, z1.d\n" + "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "mov z22.d, z1.d\n" + "mov z21.d, z1.d\n" + "sdot z1.s, z31.b, z5.b\n" + "sdot z22.s, z31.b, z3.b\n" + "sdot z7.s, z12.b, z3.b\n" + "sdot z1.s, z29.b, z3.b\n" + "ext z3.b, z3.b, z3.b, #0x1\n" + "sdot z22.s, z29.b, z0.b\n" + "sdot z7.s, z12.b, z0.b\n" + "sdot z1.s, z27.b, z0.b\n" + "ext z0.b, z0.b, z0.b, #0x1\n" + "sdot z22.s, z27.b, z28.b\n" + "mov z20.d, z7.d\n" + "sdot z7.s, z12.b, z5.b\n" + "sdot z20.s, z12.b, z28.b\n" + "ext z5.b, z5.b, z5.b, #0x1\n" + "ext z28.b, z28.b, z28.b, #0x1\n" + "sdot z21.s, z31.b, z3.b\n" + "sdot z6.s, z12.b, z3.b\n" + "sdot z24.s, z31.b, z5.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "sdot z21.s, z29.b, z0.b\n" + "sdot z6.s, z12.b, z0.b\n" + "sdot z24.s, z29.b, z3.b\n" + "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "sdot z21.s, z27.b, z28.b\n" + "mov z19.d, z6.d\n" + "sdot z24.s, z27.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n" + "sdot z6.s, z12.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [SP]\n" + "sdot z19.s, z12.b, z28.b\n" + "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "mov z7.s, #0x0\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "sdot z7.s, z12.b, z2.b\n" + "mov z6.s, #0x0\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "sqadd z1.s, z1.s, z16.s\n" + "sdot z7.s, z12.b, z30.b\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "and z17.d, z22.d, z23.d\n" + "mov z20.d, z7.d\n" + "asr z17.s, z17.s, #0x1f\n" + "sdot z7.s, z12.b, z4.b\n" + "sdot z20.s, z12.b, z26.b\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + "sqadd z24.s, z24.s, z18.s\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "sqadd z22.s, z22.s, z17.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n" + "add z1.s, z1.s, z8.s\n" + "and z16.d, z21.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "add z24.s, z24.s, z8.s\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z22.s, z22.s, z8.s\n" + "sqadd z21.s, z21.s, z16.s\n" + "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "mov z24.d, z1.d\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "add z21.s, z21.s, z8.s\n" + "mov z22.d, z1.d\n" + "sdot z22.s, z31.b, z2.b\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "sdot z22.s, z29.b, z30.b\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "mov z21.d, z1.d\n" + "incw x19\n" + "sdot z1.s, z31.b, z4.b\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "sdot z22.s, z27.b, z26.b\n" + "ext z4.b, z4.b, z4.b, #0x1\n" + "ext z26.b, z26.b, z26.b, #0x1\n" + 
"sdot z1.s, z29.b, z2.b\n" + "ext z2.b, z2.b, z2.b, #0x1\n" + "sdot z24.s, z31.b, z4.b\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + "sdot z1.s, z27.b, z30.b\n" + "ext z30.b, z30.b, z30.b, #0x1\n" + "sdot z21.s, z31.b, z2.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n" + "sdot z24.s, z29.b, z2.b\n" + "sdot z6.s, z12.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + "sdot z21.s, z29.b, z30.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n" + "sdot z24.s, z27.b, z30.b\n" + "sdot z6.s, z12.b, z30.b\n" + "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n" + "and z17.d, z22.d, z23.d\n" + "asr z17.s, z17.s, #0x1f\n" + "sdot z21.s, z27.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n" + "mov z19.d, z6.d\n" + "sdot z6.s, z12.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n" + "sdot z19.s, z12.b, z26.b\n" + "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "mov z7.s, #0x0\n" + "sqadd z22.s, z22.s, z17.s\n" + "sdot z7.s, z12.b, z3.b\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "mov z6.s, #0x0\n" + "sdot z7.s, z12.b, z0.b\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "mov z20.d, z7.d\n" + "sdot z7.s, z12.b, z5.b\n" + "sdot z20.s, z12.b, z28.b\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "sqadd z1.s, z1.s, z16.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + "ld1w { z25.s }, p2/Z, [%x[params]]\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + "and z16.d, z21.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z24.s, z24.s, z18.s\n" + "add z22.s, z22.s, z8.s\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n" + "add z1.s, z1.s, z8.s\n" + "sqadd z21.s, z21.s, z16.s\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "add z24.s, z24.s, z8.s\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z21.s, z21.s, z8.s\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "mov z24.d, z1.d\n" + "mov z22.d, z1.d\n" + "sdot z22.s, z31.b, z3.b\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "mov z21.d, z1.d\n" + "incw x19\n" + "sdot z1.s, z31.b, z5.b\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "sdot z22.s, z29.b, z0.b\n" + "ext z5.b, z5.b, z5.b, #0x1\n" + "sdot z1.s, z29.b, z3.b\n" + "sdot z22.s, z27.b, z28.b\n" + "ext z3.b, z3.b, z3.b, #0x1\n" + "ext z28.b, z28.b, z28.b, #0x1\n" + "sdot z24.s, z31.b, z5.b\n" + "sdot z1.s, z27.b, z0.b\n" + "ext z0.b, z0.b, z0.b, #0x1\n" + "sdot z21.s, z31.b, z3.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "sdot z24.s, z29.b, z3.b\n" + "sdot z6.s, z12.b, z3.b\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "sdot z21.s, z29.b, z0.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n" + "sdot z24.s, z27.b, z0.b\n" + "sdot z6.s, z12.b, z0.b\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "sdot z21.s, z27.b, z28.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n" + "mov z7.s, 
#0x0\n" + "mov z19.d, z6.d\n" + "sdot z6.s, z12.b, z5.b\n" + "sdot z19.s, z12.b, z28.b\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sdot z7.s, z12.b, z2.b\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "mov z6.s, #0x0\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "sqadd z1.s, z1.s, z16.s\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "and z17.d, z22.d, z23.d\n" + "and z16.d, z21.d, z23.d\n" + "asr z17.s, z17.s, #0x1f\n" + "sdot z7.s, z12.b, z30.b\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z24.s, z24.s, z18.s\n" + "add z1.s, z1.s, z8.s\n" + "mov z20.d, z7.d\n" + "sqadd z22.s, z22.s, z17.s\n" + "sqadd z21.s, z21.s, z16.s\n" + "sdot z7.s, z12.b, z4.b\n" + "sdot z20.s, z12.b, z26.b\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z24.s, z24.s, z8.s\n" + "add z22.s, z22.s, z8.s\n" + "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "addvl %x[params], %x[params], #8\n" + "add z21.s, z21.s, z8.s\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "mov z24.d, z1.d\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "mov z22.d, z1.d\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "mov z21.d, z1.d\n" + "incw x19\n" + "sdot z1.s, z31.b, z4.b\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "sdot z22.s, z31.b, z2.b\n" + "ext z4.b, z4.b, z4.b, #0x1\n" + "sdot z1.s, z29.b, z2.b\n" + "sdot z22.s, z29.b, z30.b\n" + "ext z2.b, z2.b, z2.b, #0x1\n" + "sdot z24.s, z31.b, z4.b\n" + "sdot z1.s, z27.b, z30.b\n" + "sdot z22.s, z27.b, z26.b\n" + "ext z30.b, z30.b, z30.b, #0x1\n" + "ext z26.b, z26.b, z26.b, #0x1\n" + "sdot z21.s, z31.b, z2.b\n" + "sdot z24.s, z29.b, z2.b\n" + "sdot z6.s, z12.b, z2.b\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "sdot z21.s, z29.b, z30.b\n" + "sdot z24.s, z27.b, z30.b\n" + "sdot z6.s, z12.b, z30.b\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "sdot z21.s, z27.b, z26.b\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + "mov z19.d, z6.d\n" + "sdot z6.s, z12.b, z4.b\n" + "sdot z19.s, z12.b, z26.b\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "and z17.d, z22.d, z23.d\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z1.s, z1.s, z16.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "and z16.d, z21.d, z23.d\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z22.s, z22.s, z17.s\n" + "add z1.s, z1.s, z8.s\n" + "sqadd z24.s, z24.s, z18.s\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + "sqadd z21.s, z21.s, z16.s\n" + ".inst 0x44828af8 // srshl 
z24.s, p2/M, z24.s, z23.s\n" + "add z22.s, z22.s, z8.s\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z24.s, z24.s, z8.s\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "add z21.s, z21.s, z8.s\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "incw x19\n" + "whilelt p1.b, x19, %x[n_channels]\n" + "b.any 1b\n" + "addvl SP, SP, #8\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..7dd241a8cf --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
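The store path of the dot kernel just closed follows the usual per-layer requantization: sqrdmulh by the multiplier, a sign fixup (the and/asr/sqadd triple), a rounding right shift (srshl by a negative amount), the c_offset addition, and an smax/smin clamp before the narrowing st1b. A simplified scalar model, assuming a non-negative right-shift amount and ignoring the INT32_MIN saturation corner that sqrdmulh handles (names are illustrative):

#include <algorithm>
#include <cstdint>

static inline int8_t requantize(int32_t acc, int32_t mul, int shift,
                                int32_t c_offset, int32_t minval, int32_t maxval)
{
    const int64_t prod = (int64_t)acc * (int64_t)mul;
    const int32_t high = (int32_t)((prod + (1LL << 30)) >> 31);       // sqrdmulh: rounded doubling high half
    // the vector code's and/asr/sqadd adjusts rounding for negative values before the shift
    const int32_t rounded = shift > 0
        ? (int32_t)(((int64_t)high + (1LL << (shift - 1))) >> shift)  // srshl by a negative (right) shift
        : high;
    const int32_t out = rounded + c_offset;                           // add the c_offset splat
    return (int8_t)std::min(std::max(out, minval), maxval);          // smax/smin against minval/maxval
}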
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + +struct sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + + kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + + sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..8bf5badfaf --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include +#include + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + int8_t *const *const outptrs; + const int8_t *inptrs[16]; + + Params( + long unsigned int n_channels, + const int8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[5]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[3]; + inptrs[3] = inptrs_raw[6]; + inptrs[4] = inptrs_raw[9]; + inptrs[5] = inptrs_raw[12]; + inptrs[6] = inptrs_raw[15]; + inptrs[7] = inptrs_raw[1]; + inptrs[8] = inptrs_raw[2]; + inptrs[9] = inptrs_raw[10]; + inptrs[10] = inptrs_raw[4]; + inptrs[11] = inptrs_raw[7]; + inptrs[12] = inptrs_raw[8]; + inptrs[13] = inptrs_raw[11]; + inptrs[14] = inptrs_raw[13]; + inptrs[15] = inptrs_raw[14]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n" + "ptrue p4.b\n" + "ldr x16, [%x[params], %[offsetof_Params_weights]]\n" + "mov x15, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "mov x14, #0x0\n" + "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x12, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1rb { z12.b }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1rb { z18.b }, p4/Z, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1rw { z15.s }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1rw { z13.s }, p4/Z, [x20]\n" + "whilelt p3.h, x15, x17\n" + "ld1rw { z14.s }, p4/Z, [x19]\n" + "whilelt p2.s, x15, x17\n" + "ldp x10, x9, [x21, #0x0]\n" + "mov x19, x15\n" + "incw x19\n" + "ldp x28, x27, [x21, #0x10]\n" + "whilelt p1.s, x19, x17\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z17.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z11.s, z17.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z17.s, z17.s, z16.s\n" + "mov z9.d, z11.d\n" + "ld1sb { z0.h }, p4/Z, [x16]\n" + ".inst 0x45521000 
// ssublb z0.h, z0.b, z18.b\n" + "mov z20.d, z17.d\n" + "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n" + "mov z24.d, z11.d\n" + "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n" + ".inst 0x45521021 // ssublb z1.h, z1.b, z18.b\n" + "mov z19.d, z17.d\n" + "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n" + "mov z26.d, z11.d\n" + "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n" + ".inst 0x45521042 // ssublb z2.h, z2.b, z18.b\n" + "mov z23.d, z17.d\n" + "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n" + ".inst 0x45521063 // ssublb z3.h, z3.b, z18.b\n" + "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n" + "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n" + ".inst 0x45521084 // ssublb z4.h, z4.b, z18.b\n" + "inch x16, ALL, MUL #8\n" + "ld1sb { z8.h }, p4/Z, [x16]\n" + "ldp x23, x22, [x12, #0x0]\n" + ".inst 0x455210a5 // ssublb z5.h, z5.b, z18.b\n" + ".inst 0x455210c6 // ssublb z6.h, z6.b, z18.b\n" + "ldp x21, x20, [x12, #0x10]\n" + ".inst 0x455210e7 // ssublb z7.h, z7.b, z18.b\n" + ".inst 0x45521108 // ssublb z8.h, z8.b, z18.b\n" + "ldr x19, [x12, #0x20]\n" + "ld1sb { z31.h }, p3/Z, [x23, x15]\n" + ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n" + "ld1sb { z30.h }, p3/Z, [x22, x15]\n" + "ld1sb { z29.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n" + "ld1sb { z28.h }, p3/Z, [x20, x15]\n" + "ld1sb { z27.h }, p3/Z, [x19, x15]\n" + ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n" + ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n" + ".inst 0x454c137b // ssublb z27.h, z27.b, z12.b\n" + "1:" // Loop + ".inst 0x448443eb // smlalb z11.s, p4/M, z31.h, z4.h\n" + "ldr x21, [x12, #0x28]\n" + "whilelt p0.h, x14, x17\n" + ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n" + "ldr x20, [x12, #0x30]\n" + "inch x16\n" + ".inst 0x448343e9 // smlalb z9.s, p4/M, z31.h, z3.h\n" + "ldr x26, [x12, #0x38]\n" + ".inst 0x448347f4 // smlalt z20.s, p4/M, z31.h, z3.h\n" + "ldr x25, [x12, #0x40]\n" + ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n" + "ldr x19, [x12, #0x48]\n" + ".inst 0x448147f3 // smlalt z19.s, p4/M, z31.h, z1.h\n" + "ldr x24, [x12, #0x50]\n" + ".inst 0x448043fa // smlalb z26.s, p4/M, z31.h, z0.h\n" + "ldr x23, [x12, #0x58]\n" + ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n" + "ld1sb { z31.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n" + ".inst 0x448043cb // smlalb z11.s, p4/M, z30.h, z0.h\n" + "ldr x22, [x12, #0x60]\n" + ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n" + "ld1sb { z30.h }, p3/Z, [x19, x15]\n" + ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n" + ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n" + "ldr x21, [x12, #0x68]\n" + ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n" + "ld1sb { z29.h }, p3/Z, [x20, x15]\n" + ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n" + ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n" + "ldr x20, [x12, #0x70]\n" + ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n" + "ldr x19, [x12, #0x78]\n" + ".inst 0x44844389 // smlalb z9.s, p4/M, z28.h, z4.h\n" + "ld1w { z25.s }, p2/Z, [x13]\n" + ".inst 0x44844794 // smlalt z20.s, p4/M, z28.h, z4.h\n" + "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n" + "addvl x13, x13, #2\n" + ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n" + ".inst 0x44824793 // smlalt z19.s, p4/M, z28.h, z2.h\n" + ".inst 0x4481439a // smlalb z26.s, p4/M, z28.h, z1.h\n" + "uzp1 z10.s, z25.s, z16.s\n" + "uzp2 z22.s, z25.s, z16.s\n" + "ld1w { z25.s }, p2/Z, [x11]\n" + ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n" + "ld1sb { z28.h }, p3/Z, [x26, x15]\n" + ".inst 
0x454c139c // ssublb z28.h, z28.b, z12.b\n" + ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n" + "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n" + ".inst 0x448647f3 // smlalt z19.s, p4/M, z31.h, z6.h\n" + "ld1sb { z31.h }, p3/Z, [x25, x15]\n" + "addvl x11, x11, #2\n" + ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n" + ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n" + "uzp1 z21.s, z25.s, z16.s\n" + "uzp2 z25.s, z25.s, z16.s\n" + ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n" + ".inst 0x44864369 // smlalb z9.s, p4/M, z27.h, z6.h\n" + ".inst 0x44864774 // smlalt z20.s, p4/M, z27.h, z6.h\n" + ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n" + ".inst 0x44844773 // smlalt z19.s, p4/M, z27.h, z4.h\n" + ".inst 0x4483437a // smlalb z26.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834777 // smlalt z23.s, p4/M, z27.h, z3.h\n" + ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n" + ".inst 0x448843ba // smlalb z26.s, p4/M, z29.h, z8.h\n" + ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n" + "ld1sb { z29.h }, p3/Z, [x24, x15]\n" + ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n" + ".inst 0x44804389 // smlalb z9.s, p4/M, z28.h, z0.h\n" + ".inst 0x44804794 // smlalt z20.s, p4/M, z28.h, z0.h\n" + "ld1sb { z28.h }, p3/Z, [x23, x15]\n" + ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n" + ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n" + ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n" + ".inst 0x448143e9 // smlalb z9.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147f4 // smlalt z20.s, p4/M, z31.h, z1.h\n" + "ld1sb { z31.h }, p3/Z, [x22, x15]\n" + ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n" + ".inst 0x448843cb // smlalb z11.s, p4/M, z30.h, z8.h\n" + ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n" + ".inst 0x448743c9 // smlalb z9.s, p4/M, z30.h, z7.h\n" + ".inst 0x448747d4 // smlalt z20.s, p4/M, z30.h, z7.h\n" + ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n" + ".inst 0x448547d3 // smlalt z19.s, p4/M, z30.h, z5.h\n" + ".inst 0x448443da // smlalb z26.s, p4/M, z30.h, z4.h\n" + ".inst 0x448447d7 // smlalt z23.s, p4/M, z30.h, z4.h\n" + "ld1sb { z30.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n" + ".inst 0x448343ab // smlalb z11.s, p4/M, z29.h, z3.h\n" + ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n" + ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n" + ".inst 0x448047b3 // smlalt z19.s, p4/M, z29.h, z0.h\n" + "ld1sb { z29.h }, p3/Z, [x20, x15]\n" + ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n" + ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n" + ".inst 0x44854794 // smlalt z20.s, p4/M, z28.h, z5.h\n" + ".inst 0x4482439a // smlalb z26.s, p4/M, z28.h, z2.h\n" + ".inst 0x44824797 // smlalt z23.s, p4/M, z28.h, z2.h\n" + "ld1sb { z28.h }, p3/Z, [x19, x15]\n" + "inch x15\n" + ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n" + "whilelt p2.s, x15, x17\n" + ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n" + "mov x19, x15\n" + ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n" + ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n" + ".inst 0x448347f3 // smlalt z19.s, p4/M, z31.h, z3.h\n" + "incw x19\n" + ".inst 0x448843c9 // smlalb z9.s, p4/M, z30.h, z8.h\n" + "whilelt p1.s, x19, x17\n" + ".inst 0x04aa756b // sqrdmulh z11.s, z11.s, z10.s\n" + "whilelt p3.h, x15, x17\n" + ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n" + ".inst 0x448847d4 // smlalt z20.s, p4/M, z30.h, z8.h\n" + ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, 
z10.s\n" + "and z16.d, z11.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "and z1.d, z17.d, z25.d\n" + "and z27.d, z9.d, z21.d\n" + "asr z1.s, z1.s, #0x1f\n" + ".inst 0x04b67694 // sqrdmulh z20.s, z20.s, z22.s\n" + ".inst 0x448543da // smlalb z26.s, p4/M, z30.h, z5.h\n" + "asr z27.s, z27.s, #0x1f\n" + ".inst 0x448547d7 // smlalt z23.s, p4/M, z30.h, z5.h\n" + "sqadd z11.s, z11.s, z16.s\n" + ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n" + "and z16.d, z20.d, z25.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z17.s, z17.s, z1.s\n" + "sqadd z9.s, z9.s, z27.s\n" + ".inst 0x448747b3 // smlalt z19.s, p4/M, z29.h, z7.h\n" + ".inst 0x448643ba // smlalb z26.s, p4/M, z29.h, z6.h\n" + ".inst 0x448647b7 // smlalt z23.s, p4/M, z29.h, z6.h\n" + ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n" + "sqadd z20.s, z20.s, z16.s\n" + ".inst 0x44884793 // smlalt z19.s, p4/M, z28.h, z8.h\n" + ".inst 0x4487439a // smlalb z26.s, p4/M, z28.h, z7.h\n" + ".inst 0x04aa7718 // sqrdmulh z24.s, z24.s, z10.s\n" + ".inst 0x44874797 // smlalt z23.s, p4/M, z28.h, z7.h\n" + ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n" + ".inst 0x04aa775a // sqrdmulh z26.s, z26.s, z10.s\n" + "and z16.d, z24.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "and z7.d, z19.d, z25.d\n" + "and z3.d, z26.d, z21.d\n" + "asr z7.s, z7.s, #0x1f\n" + ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n" + ".inst 0x448292ab // srshl z11.s, p4/M, z11.s, z21.s\n" + "asr z3.s, z3.s, #0x1f\n" + ".inst 0x44829331 // srshl z17.s, p4/M, z17.s, z25.s\n" + "sqadd z24.s, z24.s, z16.s\n" + ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n" + "add z11.s, z11.s, z15.s\n" + "add z17.s, z17.s, z15.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "add z9.s, z9.s, z15.s\n" + "sqadd z26.s, z26.s, z3.s\n" + "and z16.d, z23.d, z25.d\n" + "asr z16.s, z16.s, #0x1f\n" + "smin z11.s, p4/M, z11.s, z14.s\n" + "smin z17.s, p4/M, z17.s, z14.s\n" + "smin z9.s, p4/M, z9.s, z14.s\n" + ".inst 0x44829334 // srshl z20.s, p4/M, z20.s, z25.s\n" + ".inst 0x448292b8 // srshl z24.s, p4/M, z24.s, z21.s\n" + "smax z11.s, p4/M, z11.s, z13.s\n" + "sqadd z23.s, z23.s, z16.s\n" + "add z20.s, z20.s, z15.s\n" + "add z24.s, z24.s, z15.s\n" + "smax z17.s, p4/M, z17.s, z13.s\n" + "smax z9.s, p4/M, z9.s, z13.s\n" + "smin z20.s, p4/M, z20.s, z14.s\n" + "smin z24.s, p4/M, z24.s, z14.s\n" + "trn1 z11.h, z11.h, z17.h\n" + "st1b { z11.h }, p0, [x10, x14]\n" + "smax z20.s, p4/M, z20.s, z13.s\n" + ".inst 0x44829333 // srshl z19.s, p4/M, z19.s, z25.s\n" + "smax z24.s, p4/M, z24.s, z13.s\n" + ".inst 0x448292ba // srshl z26.s, p4/M, z26.s, z21.s\n" + ".inst 0x44829337 // srshl z23.s, p4/M, z23.s, z25.s\n" + "trn1 z9.h, z9.h, z20.h\n" + "st1b { z9.h }, p0, [x9, x14]\n" + "add z19.s, z19.s, z15.s\n" + "add z26.s, z26.s, z15.s\n" + "add z23.s, z23.s, z15.s\n" + "smin z19.s, p4/M, z19.s, z14.s\n" + "smin z26.s, p4/M, z26.s, z14.s\n" + "smin z23.s, p4/M, z23.s, z14.s\n" + "smax z19.s, p4/M, z19.s, z13.s\n" + "smax z26.s, p4/M, z26.s, z13.s\n" + "smax z23.s, p4/M, z23.s, z13.s\n" + "trn1 z24.h, z24.h, z19.h\n" + "st1b { z24.h }, p0, [x28, x14]\n" + "trn1 z26.h, z26.h, z23.h\n" + "st1b { z26.h }, p0, [x27, x14]\n" + "inch x14\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z17.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z11.s, z17.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z17.s, z17.s, z16.s\n" + "mov z9.d, z11.d\n" + "ld1sb { z0.h }, p4/Z, [x16]\n" + ".inst 0x45521000 // ssublb z0.h, z0.b, z18.b\n" + "mov z20.d, 
z17.d\n" + "ld1sb { z1.h }, p4/Z, [x16, #1, MUL VL]\n" + "mov z24.d, z11.d\n" + "ld1sb { z2.h }, p4/Z, [x16, #2, MUL VL]\n" + ".inst 0x45521021 // ssublb z1.h, z1.b, z18.b\n" + "mov z19.d, z17.d\n" + "ld1sb { z3.h }, p4/Z, [x16, #3, MUL VL]\n" + "mov z26.d, z11.d\n" + "ld1sb { z4.h }, p4/Z, [x16, #4, MUL VL]\n" + ".inst 0x45521042 // ssublb z2.h, z2.b, z18.b\n" + "mov z23.d, z17.d\n" + "ld1sb { z5.h }, p4/Z, [x16, #5, MUL VL]\n" + ".inst 0x45521063 // ssublb z3.h, z3.b, z18.b\n" + "ld1sb { z6.h }, p4/Z, [x16, #6, MUL VL]\n" + "ld1sb { z7.h }, p4/Z, [x16, #7, MUL VL]\n" + ".inst 0x45521084 // ssublb z4.h, z4.b, z18.b\n" + "inch x16, ALL, MUL #8\n" + "ld1sb { z8.h }, p4/Z, [x16]\n" + "ldp x23, x22, [x12, #0x0]\n" + ".inst 0x455210a5 // ssublb z5.h, z5.b, z18.b\n" + ".inst 0x455210c6 // ssublb z6.h, z6.b, z18.b\n" + "ldp x21, x20, [x12, #0x10]\n" + ".inst 0x455210e7 // ssublb z7.h, z7.b, z18.b\n" + ".inst 0x45521108 // ssublb z8.h, z8.b, z18.b\n" + "ldr x19, [x12, #0x20]\n" + "ld1sb { z31.h }, p3/Z, [x23, x15]\n" + ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n" + "ld1sb { z30.h }, p3/Z, [x22, x15]\n" + "ld1sb { z29.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c13de // ssublb z30.h, z30.b, z12.b\n" + "ld1sb { z28.h }, p3/Z, [x20, x15]\n" + "ld1sb { z27.h }, p3/Z, [x19, x15]\n" + ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n" + ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n" + ".inst 0x454c137b // ssublb z27.h, z27.b, z12.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..89507ef9ea --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. 
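In the MLA kernels above, each ssublb widens a lane to 16 bits while subtracting the relevant quantization offset (a_offset for inputs, b_offset for weights), and smlalb/smlalt then accumulate the widened products into 32-bit lanes. A scalar sketch of one such multiply-accumulate (the function name is illustrative):

#include <cstdint>

static inline int32_t s8q_mac(int32_t acc, int8_t x, int8_t w,
                              int8_t a_offset, int8_t b_offset)
{
    const int16_t xw = (int16_t)((int16_t)x - (int16_t)a_offset);  // ssublb input, a_offset splat
    const int16_t ww = (int16_t)((int16_t)w - (int16_t)b_offset);  // ssublb weight, b_offset splat
    return acc + (int32_t)xw * (int32_t)ww;                        // smlalb/smlalt into int32
}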
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + +struct sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + + kern_type kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + + sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..b773ca1fe6 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2021 Arm
Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include +#include + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + int8_t *const *const outptrs; + const int8_t *inptrs[25]; + + Params( + long unsigned int n_channels, + const int8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[12]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[1]; + inptrs[3] = inptrs_raw[3]; + inptrs[4] = inptrs_raw[4]; + inptrs[5] = inptrs_raw[5]; + inptrs[6] = inptrs_raw[6]; + inptrs[7] = inptrs_raw[2]; + inptrs[8] = inptrs_raw[8]; + inptrs[9] = inptrs_raw[9]; + inptrs[10] = inptrs_raw[7]; + inptrs[11] = inptrs_raw[15]; + inptrs[12] = inptrs_raw[10]; + inptrs[13] = inptrs_raw[16]; + inptrs[14] = inptrs_raw[11]; + inptrs[15] = inptrs_raw[18]; + inptrs[16] = inptrs_raw[13]; + inptrs[17] = inptrs_raw[19]; + inptrs[18] = inptrs_raw[20]; + inptrs[19] = inptrs_raw[14]; + inptrs[20] = inptrs_raw[21]; + inptrs[21] = inptrs_raw[17]; + inptrs[22] = inptrs_raw[23]; + inptrs[23] = inptrs_raw[22]; + inptrs[24] = inptrs_raw[24]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n" + "ptrue p4.b\n" + "ldr x6, [%x[params], %[offsetof_Params_weights]]\n" + "mov x7, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "mov x8, #0x0\n" + "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add 
x16, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1rb { z19.b }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1rb { z12.b }, p4/Z, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1rw { z14.s }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1rw { z20.s }, p4/Z, [x20]\n" + "whilelt p3.h, x7, x5\n" + "ld1rw { z15.s }, p4/Z, [x19]\n" + "whilelt p2.s, x7, x5\n" + "ldp x14, x13, [x21, #0x0]\n" + "mov x19, x7\n" + "incw x19\n" + "ldp x12, x11, [x21, #0x10]\n" + "whilelt p1.s, x19, x5\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z18.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z18.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z18.s, z16.s\n" + "mov z11.d, z13.d\n" + "ld1sb { z0.h }, p4/Z, [x6]\n" + ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n" + "mov z9.d, z16.d\n" + "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n" + "mov z18.d, z13.d\n" + "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n" + ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n" + "mov z10.d, z16.d\n" + "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n" + "mov z22.d, z13.d\n" + "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n" + ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n" + "mov z23.d, z16.d\n" + "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n" + ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n" + "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n" + "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n" + ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n" + "inch x6, ALL, MUL #8\n" + "ld1sb { z8.h }, p4/Z, [x6]\n" + "ldp x26, x25, [x16, #0x0]\n" + ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n" + ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n" + "ldp x24, x23, [x16, #0x10]\n" + ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n" + ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ld1sb { z31.h }, p3/Z, [x26, x7]\n" + ".inst 0x455313ff // ssublb z31.h, z31.b, z19.b\n" + "ld1sb { z30.h }, p3/Z, [x25, x7]\n" + "ld1sb { z29.h }, p3/Z, [x24, x7]\n" + ".inst 0x455313de // ssublb z30.h, z30.b, z19.b\n" + "ld1sb { z28.h }, p3/Z, [x23, x7]\n" + "ld1sb { z27.h }, p3/Z, [x22, x7]\n" + ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n" + "ld1sb { z26.h }, p3/Z, [x21, x7]\n" + ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n" + "ld1sb { z25.h }, p3/Z, [x20, x7]\n" + "ld1sb { z24.h }, p3/Z, [x19, x7]\n" + ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n" + ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n" + ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n" + ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n" + "1:" // Loop + ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n" + "ldr x23, [x16, #0x40]\n" + "whilelt p0.h, x8, x5\n" + ".inst 0x448847f0 // smlalt z16.s, p4/M, z31.h, z8.h\n" + "ldr x22, [x16, #0x48]\n" + "inch x6\n" + ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n" + "ldr x21, [x16, #0x50]\n" + ".inst 0x448647e9 // smlalt z9.s, p4/M, z31.h, z6.h\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x448243f2 // smlalb z18.s, p4/M, z31.h, z2.h\n" + "ldr x19, [x16, #0x60]\n" + ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n" + "ldr x10, [x16, #0x68]\n" + ".inst 0x448043f6 // smlalb z22.s, p4/M, z31.h, 
z0.h\n" + "ldr x9, [x16, #0x70]\n" + ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n" + "ldr x28, [x16, #0x78]\n" + ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n" + "ldr x27, [x16, #0x80]\n" + ".inst 0x448047d0 // smlalt z16.s, p4/M, z30.h, z0.h\n" + "ldr x26, [x16, #0x88]\n" + ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n" + "ldr x25, [x16, #0x90]\n" + ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n" + "ld1sb { z28.h }, p3/Z, [x22, x7]\n" + ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n" + ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n" + "ldr x24, [x16, #0x98]\n" + ".inst 0x448147b0 // smlalt z16.s, p4/M, z29.h, z1.h\n" + "ld1sb { z29.h }, p3/Z, [x23, x7]\n" + ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n" + ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n" + "ldr x23, [x16, #0xa0]\n" + ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n" + "ld1sb { z27.h }, p3/Z, [x21, x7]\n" + ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n" + ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n" + "ldr x22, [x16, #0xa8]\n" + ".inst 0x44834750 // smlalt z16.s, p4/M, z26.h, z3.h\n" + "ld1sb { z26.h }, p3/Z, [x20, x7]\n" + ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n" + ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n" + "ldr x21, [x16, #0xb0]\n" + ".inst 0x44844730 // smlalt z16.s, p4/M, z25.h, z4.h\n" + "ld1sb { z25.h }, p3/Z, [x19, x7]\n" + ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n" + ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n" + "ldr x19, [x16, #0xc0]\n" + ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n" + "ld1w { z21.s }, p2/Z, [x17]\n" + ".inst 0x44804709 // smlalt z9.s, p4/M, z24.h, z0.h\n" + "ld1sb { z24.h }, p3/Z, [x9, x7]\n" + ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n" + ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n" + "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n" + ".inst 0x448447a9 // smlalt z9.s, p4/M, z29.h, z4.h\n" + "ld1sb { z29.h }, p3/Z, [x10, x7]\n" + "addvl x17, x17, #2\n" + ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n" + ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n" + "uzp1 z30.s, z21.s, z17.s\n" + "uzp2 z31.s, z21.s, z17.s\n" + "ld1w { z21.s }, p2/Z, [x15]\n" + ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n" + "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n" + "addvl x15, x15, #2\n" + ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n" + "ld1sb { z28.h }, p3/Z, [x27, x7]\n" + ".inst 0x4553139c // ssublb z28.h, z28.b, z19.b\n" + ".inst 0x44854770 // smlalt z16.s, p4/M, z27.h, z5.h\n" + ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n" + "ld1sb { z27.h }, p3/Z, [x28, x7]\n" + ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n" + ".inst 0x44834352 // smlalb z18.s, p4/M, z26.h, z3.h\n" + ".inst 0x4483474a // smlalt z10.s, p4/M, z26.h, z3.h\n" + "ld1sb { z26.h }, p3/Z, [x26, x7]\n" + ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n" + ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n" + ".inst 0x44864730 // smlalt z16.s, p4/M, z25.h, z6.h\n" + ".inst 0x44804332 // smlalb z18.s, p4/M, z25.h, z0.h\n" + ".inst 0x4480472a // smlalt z10.s, p4/M, z25.h, z0.h\n" + "ld1sb { z25.h }, p3/Z, [x25, x7]\n" + ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n" + "uzp1 z0.s, z21.s, z17.s\n" + "uzp2 z21.s, z21.s, z17.s\n" + ".inst 0x448443b2 // smlalb z18.s, p4/M, z29.h, z4.h\n" + ".inst 0x448447aa // smlalt z10.s, p4/M, 
z29.h, z4.h\n" + "ld1sb { z29.h }, p3/Z, [x24, x7]\n" + ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n" + ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n" + ".inst 0x44874710 // smlalt z16.s, p4/M, z24.h, z7.h\n" + ".inst 0x44814312 // smlalb z18.s, p4/M, z24.h, z1.h\n" + ".inst 0x4481470a // smlalt z10.s, p4/M, z24.h, z1.h\n" + "ld1sb { z24.h }, p3/Z, [x22, x7]\n" + ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n" + ".inst 0x04be75ad // sqrdmulh z13.s, z13.s, z30.s\n" + ".inst 0x04bf7610 // sqrdmulh z16.s, z16.s, z31.s\n" + ".inst 0x44844376 // smlalb z22.s, p4/M, z27.h, z4.h\n" + ".inst 0x44844777 // smlalt z23.s, p4/M, z27.h, z4.h\n" + "ld1sb { z27.h }, p3/Z, [x23, x7]\n" + ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n" + "and z4.d, z13.d, z0.d\n" + "and z17.d, z16.d, z21.d\n" + "asr z4.s, z4.s, #0x1f\n" + ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n" + ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n" + "asr z17.s, z17.s, #0x1f\n" + ".inst 0x44814396 // smlalb z22.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n" + ".inst 0x44864332 // smlalb z18.s, p4/M, z25.h, z6.h\n" + ".inst 0x4486472a // smlalt z10.s, p4/M, z25.h, z6.h\n" + "ld1sb { z25.h }, p3/Z, [x20, x7]\n" + ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n" + "sqadd z13.s, z13.s, z4.s\n" + "sqadd z16.s, z16.s, z17.s\n" + ".inst 0x44854356 // smlalb z22.s, p4/M, z26.h, z5.h\n" + ".inst 0x44854757 // smlalt z23.s, p4/M, z26.h, z5.h\n" + "ld1sb { z26.h }, p3/Z, [x21, x7]\n" + ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n" + ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n" + ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n" + ".inst 0x448243b6 // smlalb z22.s, p4/M, z29.h, z2.h\n" + ".inst 0x448247b7 // smlalt z23.s, p4/M, z29.h, z2.h\n" + "ld1sb { z29.h }, p3/Z, [x19, x7]\n" + "inch x7\n" + ".inst 0x04be756b // sqrdmulh z11.s, z11.s, z30.s\n" + "whilelt p2.s, x7, x5\n" + ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n" + "mov x19, x7\n" + ".inst 0x44874372 // smlalb z18.s, p4/M, z27.h, z7.h\n" + ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n" + ".inst 0x4487476a // smlalt z10.s, p4/M, z27.h, z7.h\n" + "incw x19\n" + ".inst 0x44834316 // smlalb z22.s, p4/M, z24.h, z3.h\n" + "whilelt p1.s, x19, x5\n" + "and z1.d, z11.d, z0.d\n" + "whilelt p3.h, x7, x5\n" + "and z17.d, z9.d, z21.d\n" + "asr z1.s, z1.s, #0x1f\n" + ".inst 0x44854312 // smlalb z18.s, p4/M, z24.h, z5.h\n" + ".inst 0x4485470a // smlalt z10.s, p4/M, z24.h, z5.h\n" + "asr z17.s, z17.s, #0x1f\n" + ".inst 0x44834717 // smlalt z23.s, p4/M, z24.h, z3.h\n" + ".inst 0x44874356 // smlalb z22.s, p4/M, z26.h, z7.h\n" + ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n" + ".inst 0x44884332 // smlalb z18.s, p4/M, z25.h, z8.h\n" + "sqadd z11.s, z11.s, z1.s\n" + "sqadd z9.s, z9.s, z17.s\n" + "add z13.s, z13.s, z14.s\n" + ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n" + ".inst 0x44874757 // smlalt z23.s, p4/M, z26.h, z7.h\n" + ".inst 0x4488472a // smlalt z10.s, p4/M, z25.h, z8.h\n" + ".inst 0x44864336 // smlalb z22.s, p4/M, z25.h, z6.h\n" + "and z17.d, z18.d, z0.d\n" + "asr z17.s, z17.s, #0x1f\n" + ".inst 0x04bf754a // sqrdmulh z10.s, z10.s, z31.s\n" + ".inst 0x44864737 // smlalt z23.s, p4/M, z25.h, z6.h\n" + ".inst 0x448843b6 // smlalb z22.s, p4/M, z29.h, z8.h\n" + "smin z13.s, p4/M, z13.s, z15.s\n" + ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n" + "and z1.d, z10.d, z21.d\n" + "asr z1.s, z1.s, #0x1f\n" + "add z16.s, z16.s, z14.s\n" + "sqadd z18.s, z18.s, z17.s\n" + ".inst 0x04be76d6 
// sqrdmulh z22.s, z22.s, z30.s\n" + ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n" + "smax z13.s, p4/M, z13.s, z20.s\n" + "smin z16.s, p4/M, z16.s, z15.s\n" + "sqadd z10.s, z10.s, z1.s\n" + "and z2.d, z22.d, z0.d\n" + "asr z2.s, z2.s, #0x1f\n" + ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n" + "smax z16.s, p4/M, z16.s, z20.s\n" + ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n" + ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n" + ".inst 0x44829012 // srshl z18.s, p4/M, z18.s, z0.s\n" + "trn1 z13.h, z13.h, z16.h\n" + "st1b { z13.h }, p0, [x14, x8]\n" + "add z11.s, z11.s, z14.s\n" + "add z9.s, z9.s, z14.s\n" + "add z18.s, z18.s, z14.s\n" + "sqadd z22.s, z22.s, z2.s\n" + "and z16.d, z23.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "smin z11.s, p4/M, z11.s, z15.s\n" + "smin z9.s, p4/M, z9.s, z15.s\n" + "smin z18.s, p4/M, z18.s, z15.s\n" + ".inst 0x448292aa // srshl z10.s, p4/M, z10.s, z21.s\n" + ".inst 0x44829016 // srshl z22.s, p4/M, z22.s, z0.s\n" + "smax z11.s, p4/M, z11.s, z20.s\n" + "sqadd z23.s, z23.s, z16.s\n" + "add z10.s, z10.s, z14.s\n" + "add z22.s, z22.s, z14.s\n" + "smax z9.s, p4/M, z9.s, z20.s\n" + "smax z18.s, p4/M, z18.s, z20.s\n" + "smin z10.s, p4/M, z10.s, z15.s\n" + "smin z22.s, p4/M, z22.s, z15.s\n" + "trn1 z11.h, z11.h, z9.h\n" + "st1b { z11.h }, p0, [x13, x8]\n" + "smax z10.s, p4/M, z10.s, z20.s\n" + ".inst 0x448292b7 // srshl z23.s, p4/M, z23.s, z21.s\n" + "smax z22.s, p4/M, z22.s, z20.s\n" + "trn1 z18.h, z18.h, z10.h\n" + "st1b { z18.h }, p0, [x12, x8]\n" + "add z23.s, z23.s, z14.s\n" + "smin z23.s, p4/M, z23.s, z15.s\n" + "smax z23.s, p4/M, z23.s, z20.s\n" + "trn1 z22.h, z22.h, z23.h\n" + "st1b { z22.h }, p0, [x11, x8]\n" + "inch x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z18.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z18.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z18.s, z16.s\n" + "mov z11.d, z13.d\n" + "ld1sb { z0.h }, p4/Z, [x6]\n" + ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n" + "mov z9.d, z16.d\n" + "ld1sb { z1.h }, p4/Z, [x6, #1, MUL VL]\n" + "mov z18.d, z13.d\n" + "ld1sb { z2.h }, p4/Z, [x6, #2, MUL VL]\n" + ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n" + "mov z10.d, z16.d\n" + "ld1sb { z3.h }, p4/Z, [x6, #3, MUL VL]\n" + "mov z22.d, z13.d\n" + "ld1sb { z4.h }, p4/Z, [x6, #4, MUL VL]\n" + ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n" + "mov z23.d, z16.d\n" + "ld1sb { z5.h }, p4/Z, [x6, #5, MUL VL]\n" + ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n" + "ld1sb { z6.h }, p4/Z, [x6, #6, MUL VL]\n" + "ld1sb { z7.h }, p4/Z, [x6, #7, MUL VL]\n" + ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n" + "inch x6, ALL, MUL #8\n" + "ld1sb { z8.h }, p4/Z, [x6]\n" + "ldp x26, x25, [x16, #0x0]\n" + ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n" + ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n" + "ldp x24, x23, [x16, #0x10]\n" + ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n" + ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ld1sb { z31.h }, p3/Z, [x26, x7]\n" + ".inst 0x455313ff // ssublb z31.h, z31.b, z19.b\n" + "ld1sb { z30.h }, p3/Z, [x25, x7]\n" + "ld1sb { z29.h }, p3/Z, [x24, x7]\n" + ".inst 0x455313de // ssublb z30.h, z30.b, z19.b\n" + "ld1sb { z28.h }, p3/Z, [x23, x7]\n" + "ld1sb { z27.h }, p3/Z, [x22, x7]\n" + ".inst 0x455313bd // ssublb z29.h, z29.b, z19.b\n" + "ld1sb { z26.h }, p3/Z, [x21, x7]\n" + ".inst 0x4553139c // ssublb z28.h, 
z28.b, z19.b\n" + "ld1sb { z25.h }, p3/Z, [x20, x7]\n" + "ld1sb { z24.h }, p3/Z, [x19, x7]\n" + ".inst 0x4553137b // ssublb z27.h, z27.b, z19.b\n" + ".inst 0x4553135a // ssublb z26.h, z26.b, z19.b\n" + ".inst 0x45531339 // ssublb z25.h, z25.b, z19.b\n" + ".inst 0x45531318 // ssublb z24.h, z24.b, z19.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..54ac1c2e0b --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include + +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + +struct sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size; + + kern_type kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + + sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..c02bb584e5 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + int8_t *const *const outptrs; + const int8_t *inptrs[36]; + + Params( + long unsigned int n_channels, + const int8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + int8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[0]; + inptrs[1] = inptrs_raw[1]; + inptrs[2] = inptrs_raw[6]; + inptrs[3] = inptrs_raw[7]; + inptrs[4] = inptrs_raw[2]; + inptrs[5] = inptrs_raw[8]; + inptrs[6] = inptrs_raw[3]; + inptrs[7] = inptrs_raw[4]; + inptrs[8] = inptrs_raw[11]; + inptrs[9] = inptrs_raw[12]; + inptrs[10] = inptrs_raw[9]; + inptrs[11] = inptrs_raw[10]; + inptrs[12] = inptrs_raw[5]; + inptrs[13] = inptrs_raw[13]; + inptrs[14] = inptrs_raw[14]; + inptrs[15] = inptrs_raw[15]; + inptrs[16] = inptrs_raw[16]; + inptrs[17] = inptrs_raw[17]; + inptrs[18] = inptrs_raw[18]; + inptrs[19] = inptrs_raw[19]; + inptrs[20] = inptrs_raw[20]; + inptrs[21] = inptrs_raw[21]; + inptrs[22] = inptrs_raw[22]; + inptrs[23] = inptrs_raw[23]; + inptrs[24] = inptrs_raw[24]; + inptrs[25] = inptrs_raw[25]; + inptrs[26] = inptrs_raw[26]; + inptrs[27] = inptrs_raw[27]; + inptrs[28] = inptrs_raw[28]; + inptrs[29] = inptrs_raw[29]; + inptrs[30] = inptrs_raw[30]; + inptrs[31] = inptrs_raw[31]; + inptrs[32] = inptrs_raw[32]; + inptrs[33] = inptrs_raw[33]; + inptrs[34] = inptrs_raw[34]; + inptrs[35] = inptrs_raw[35]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n" + "ptrue p4.b\n" + "ldr x1, [%x[params], %[offsetof_Params_weights]]\n" + "mov x2, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "mov x3, #0x0\n" + "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x5, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1rb { z17.b }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1rb { z13.b }, p4/Z, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1rw { z14.s }, p4/Z, [x19]\n" + "add x19, x22,
%[offsetof_Requantize32_maxval]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "whilelt p3.h, x2, x0\n" + "ld1rw { z15.s }, p4/Z, [x19]\n" + "whilelt p2.s, x2, x0\n" + "ldp x7, x8, [x21, #0x0]\n" + "mov x19, x2\n" + "incw x19\n" + "ldp x17, x16, [x21, #0x10]\n" + "whilelt p1.s, x19, x0\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z19.s }, p2/Z, [x19]\n" + "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z11.s, z19.s, z6.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z19.s, z6.s\n" + "mov z19.d, z11.d\n" + "ld1sb { z0.h }, p4/Z, [x1]\n" + ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n" + "mov z9.d, z16.d\n" + "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n" + "mov z7.d, z11.d\n" + "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n" + "mov z6.d, z16.d\n" + "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n" + "mov z12.d, z11.d\n" + "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n" + "mov z8.d, z16.d\n" + "ldp x28, x27, [x5, #0x0]\n" + ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n" + "ldp x26, x25, [x5, #0x10]\n" + ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n" + "ldp x24, x23, [x5, #0x20]\n" + "ldp x22, x21, [x5, #0x30]\n" + "ldp x20, x19, [x5, #0x40]\n" + "ld1sb { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n" + "ld1sb { z30.h }, p3/Z, [x27, x2]\n" + "ld1sb { z29.h }, p3/Z, [x26, x2]\n" + ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n" + "ld1sb { z28.h }, p3/Z, [x25, x2]\n" + "ld1sb { z27.h }, p3/Z, [x24, x2]\n" + ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n" + "ld1sb { z23.h }, p3/Z, [x23, x2]\n" + ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n" + "ld1sb { z25.h }, p3/Z, [x22, x2]\n" + "ld1sb { z24.h }, p3/Z, [x21, x2]\n" + ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n" + "ld1sb { z26.h }, p3/Z, [x20, x2]\n" + ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n" + "ld1sb { z22.h }, p3/Z, [x19, x2]\n" + ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n" + ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n" + ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n" + ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n" + "1:" // Loop + ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n" + "ldr x20, [x5, #0x50]\n" + "whilelt p0.h, x3, x0\n" + ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n" + "ldr x19, [x5, #0x58]\n" + ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n" + "ldr x25, [x5, #0x60]\n" + ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n" + "ld1sb { z31.h }, p3/Z, [x20, x2]\n" + ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n" + ".inst 0x448043a7 // smlalb z7.s, p4/M, z29.h, z0.h\n" + "ldr x24, [x5, #0x68]\n" + ".inst 0x448047a6 // smlalt z6.s, p4/M, z29.h, z0.h\n" + "ldr x23, [x5, #0x70]\n" + ".inst 0x4480438c // smlalb z12.s, p4/M, z28.h, z0.h\n" + "ldr x22, [x5, #0x78]\n" + ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n" + ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n" + "ldr x15, [x5, #0x80]\n" + ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n" + "ld1sb { z30.h }, p3/Z, [x19, x2]\n" + ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n" + ".inst 0x44814373 // smlalb z19.s, p4/M, z27.h, z1.h\n" + "ldr x21, [x5, #0x88]\n" + ".inst 0x44814769 // smlalt z9.s, p4/M, z27.h, z1.h\n" + "ldr x20, [x5, #0x90]\n" + ".inst 0x44814387 // smlalb z7.s, p4/M, z28.h, z1.h\n" + "ldr 
x19, [x5, #0x98]\n" + ".inst 0x44814786 // smlalt z6.s, p4/M, z28.h, z1.h\n" + "ldr x14, [x5, #0xa0]\n" + ".inst 0x448142ec // smlalb z12.s, p4/M, z23.h, z1.h\n" + "ldr x13, [x5, #0xa8]\n" + ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n" + ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n" + "ldr x12, [x5, #0xb0]\n" + ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n" + "ld1sb { z27.h }, p3/Z, [x25, x2]\n" + ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n" + ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n" + "ldr x11, [x5, #0xb8]\n" + ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n" + "ldr x10, [x5, #0xc0]\n" + ".inst 0x448242e7 // smlalb z7.s, p4/M, z23.h, z2.h\n" + "ldr x9, [x5, #0xc8]\n" + ".inst 0x448246e6 // smlalt z6.s, p4/M, z23.h, z2.h\n" + "ldr x28, [x5, #0xd0]\n" + ".inst 0x448243ec // smlalb z12.s, p4/M, z31.h, z2.h\n" + "ldr x27, [x5, #0xd8]\n" + ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n" + "ldr x26, [x5, #0xe0]\n" + ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n" + ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n" + "ld1sb { z25.h }, p3/Z, [x24, x2]\n" + ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n" + "ldr x25, [x5, #0xe8]\n" + ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n" + ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n" + "ld1w { z18.s }, p2/Z, [x4]\n" + ".inst 0x448343e7 // smlalb z7.s, p4/M, z31.h, z3.h\n" + "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n" + "addvl x4, x4, #2\n" + ".inst 0x448347e6 // smlalt z6.s, p4/M, z31.h, z3.h\n" + ".inst 0x448343cc // smlalb z12.s, p4/M, z30.h, z3.h\n" + ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1]\n" + ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n" + "uzp1 z21.s, z18.s, z20.s\n" + "uzp2 z10.s, z18.s, z20.s\n" + "ld1w { z18.s }, p2/Z, [x6]\n" + ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n" + "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n" + "addvl x6, x6, #2\n" + ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n" + "ld1sb { z24.h }, p3/Z, [x23, x2]\n" + ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n" + ".inst 0x44844373 // smlalb z19.s, p4/M, z27.h, z4.h\n" + "ldr x24, [x5, #0xf0]\n" + ".inst 0x44844769 // smlalt z9.s, p4/M, z27.h, z4.h\n" + "ld1sb { z27.h }, p3/Z, [x22, x2]\n" + ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n" + ".inst 0x448443c7 // smlalb z7.s, p4/M, z30.h, z4.h\n" + "ldr x23, [x5, #0xf8]\n" + ".inst 0x448447c6 // smlalt z6.s, p4/M, z30.h, z4.h\n" + ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n" + ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n" + ".inst 0x448043ab // smlalb z11.s, p4/M, z29.h, z0.h\n" + ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n" + "uzp1 z29.s, z18.s, z20.s\n" + "uzp2 z20.s, z18.s, z20.s\n" + ".inst 0x44804393 // smlalb z19.s, p4/M, z28.h, z0.h\n" + ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n" + ".inst 0x448042c7 // smlalb z7.s, p4/M, z22.h, z0.h\n" + ".inst 0x448046c6 // smlalt z6.s, p4/M, z22.h, z0.h\n" + ".inst 0x4480432c // smlalb z12.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n" + ".inst 
0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n" + "ld1sb { z28.h }, p3/Z, [x21, x2]\n" + ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n" + ".inst 0x448142f3 // smlalb z19.s, p4/M, z23.h, z1.h\n" + "ldr x22, [x5, #0x100]\n" + ".inst 0x448146e9 // smlalt z9.s, p4/M, z23.h, z1.h\n" + ".inst 0x44814327 // smlalb z7.s, p4/M, z25.h, z1.h\n" + ".inst 0x44814726 // smlalt z6.s, p4/M, z25.h, z1.h\n" + ".inst 0x4481430c // smlalb z12.s, p4/M, z24.h, z1.h\n" + ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n" + ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n" + ".inst 0x448242eb // smlalb z11.s, p4/M, z23.h, z2.h\n" + ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n" + "ld1sb { z23.h }, p3/Z, [x15, x2]\n" + ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n" + ".inst 0x448243f3 // smlalb z19.s, p4/M, z31.h, z2.h\n" + "ldr x21, [x5, #0x108]\n" + ".inst 0x448247e9 // smlalt z9.s, p4/M, z31.h, z2.h\n" + ".inst 0x44824307 // smlalb z7.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824706 // smlalt z6.s, p4/M, z24.h, z2.h\n" + ".inst 0x4482436c // smlalb z12.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n" + ".inst 0x448343eb // smlalb z11.s, p4/M, z31.h, z3.h\n" + ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n" + "ld1sb { z31.h }, p3/Z, [x20, x2]\n" + ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n" + ".inst 0x448343d3 // smlalb z19.s, p4/M, z30.h, z3.h\n" + "ldr x20, [x5, #0x110]\n" + ".inst 0x448347c9 // smlalt z9.s, p4/M, z30.h, z3.h\n" + ".inst 0x44834367 // smlalb z7.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834766 // smlalt z6.s, p4/M, z27.h, z3.h\n" + ".inst 0x448342ec // smlalb z12.s, p4/M, z23.h, z3.h\n" + ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n" + ".inst 0x448443cb // smlalb z11.s, p4/M, z30.h, z4.h\n" + ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n" + "ld1sb { z30.h }, p3/Z, [x19, x2]\n" + ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n" + ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n" + "ldr x19, [x5, #0x118]\n" + ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n" + "ld1sb { z26.h }, p3/Z, [x14, x2]\n" + ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n" + ".inst 0x448442e7 // smlalb z7.s, p4/M, z23.h, z4.h\n" + ".inst 0x448446e6 // smlalt z6.s, p4/M, z23.h, z4.h\n" + ".inst 0x4484438c // smlalb z12.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n" + ".inst 0x448042cb // smlalb z11.s, p4/M, z22.h, z0.h\n" + ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n" + "ld1sb { z22.h }, p3/Z, [x11, x2]\n" + ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n" + ".inst 0x44804333 // smlalb z19.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804729 // smlalt z9.s, p4/M, z25.h, z0.h\n" + ".inst 0x448043e7 // smlalb z7.s, p4/M, z31.h, z0.h\n" + ".inst 0x448047e6 // smlalt z6.s, p4/M, z31.h, z0.h\n" + ".inst 0x448043cc // smlalb z12.s, p4/M, z30.h, z0.h\n" + ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4481432b // smlalb z11.s, p4/M, z25.h, z1.h\n" + ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n" + ".inst 0x44814730 // smlalt 
z16.s, p4/M, z25.h, z1.h\n" + "ld1sb { z25.h }, p3/Z, [x13, x2]\n" + ".inst 0x44814313 // smlalb z19.s, p4/M, z24.h, z1.h\n" + ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n" + ".inst 0x44814709 // smlalt z9.s, p4/M, z24.h, z1.h\n" + ".inst 0x448143c7 // smlalb z7.s, p4/M, z30.h, z1.h\n" + ".inst 0x448147c6 // smlalt z6.s, p4/M, z30.h, z1.h\n" + ".inst 0x4481434c // smlalb z12.s, p4/M, z26.h, z1.h\n" + ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1]\n" + ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n" + ".inst 0x4482430b // smlalb z11.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n" + "ld1sb { z24.h }, p3/Z, [x12, x2]\n" + ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n" + ".inst 0x44824373 // smlalb z19.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824347 // smlalb z7.s, p4/M, z26.h, z2.h\n" + ".inst 0x44824746 // smlalt z6.s, p4/M, z26.h, z2.h\n" + ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n" + ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n" + ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n" + ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n" + "ld1sb { z27.h }, p3/Z, [x10, x2]\n" + ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n" + ".inst 0x448342f3 // smlalb z19.s, p4/M, z23.h, z3.h\n" + ".inst 0x448346e9 // smlalt z9.s, p4/M, z23.h, z3.h\n" + ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n" + ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n" + ".inst 0x448442eb // smlalb z11.s, p4/M, z23.h, z4.h\n" + ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n" + "ld1sb { z23.h }, p3/Z, [x9, x2]\n" + ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n" + ".inst 0x44844393 // smlalb z19.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n" + "ld1sb { z28.h }, p3/Z, [x26, x2]\n" + ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n" + ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n" + ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n" + ".inst 0x448442cc // smlalb z12.s, p4/M, z22.h, z4.h\n" + ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n" + ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n" + ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n" + ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n" + "ld1sb { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n" + ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n" + ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n" + ".inst 0x44804367 // smlalb z7.s, p4/M, z27.h, z0.h\n" + ".inst 0x44804766 // smlalt z6.s, p4/M, z27.h, z0.h\n" + ".inst 0x448042ec // smlalb z12.s, p4/M, z23.h, z0.h\n" + ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n" + ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n" + ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n" + "ld1sb { z30.h }, p3/Z, [x27, x2]\n" + ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n" + ".inst 0x44814353 // smlalb z19.s, p4/M, z26.h, z1.h\n" + ".inst 0x44814749 // smlalt z9.s, p4/M, z26.h, z1.h\n" 
+ ".inst 0x448142e7 // smlalb z7.s, p4/M, z23.h, z1.h\n" + ".inst 0x448146e6 // smlalt z6.s, p4/M, z23.h, z1.h\n" + ".inst 0x448143ec // smlalb z12.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n" + ".inst 0x4482434b // smlalb z11.s, p4/M, z26.h, z2.h\n" + ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n" + "ld1sb { z26.h }, p3/Z, [x25, x2]\n" + ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n" + ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n" + ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n" + ".inst 0x448243e7 // smlalb z7.s, p4/M, z31.h, z2.h\n" + ".inst 0x448247e6 // smlalt z6.s, p4/M, z31.h, z2.h\n" + ".inst 0x448243cc // smlalb z12.s, p4/M, z30.h, z2.h\n" + ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n" + ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n" + "ld1sb { z25.h }, p3/Z, [x24, x2]\n" + ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n" + ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n" + ".inst 0x448343c7 // smlalb z7.s, p4/M, z30.h, z3.h\n" + ".inst 0x448347c6 // smlalt z6.s, p4/M, z30.h, z3.h\n" + ".inst 0x4483438c // smlalb z12.s, p4/M, z28.h, z3.h\n" + ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n" + ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n" + ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n" + "ld1sb { z24.h }, p3/Z, [x23, x2]\n" + ".inst 0x448442d3 // smlalb z19.s, p4/M, z22.h, z4.h\n" + ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n" + ".inst 0x448446c9 // smlalt z9.s, p4/M, z22.h, z4.h\n" + ".inst 0x44844387 // smlalb z7.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844786 // smlalt z6.s, p4/M, z28.h, z4.h\n" + ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1]\n" + "inch x1\n" + ".inst 0x4480436b // smlalb z11.s, p4/M, z27.h, z0.h\n" + ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n" + ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n" + "ld1sb { z27.h }, p3/Z, [x22, x2]\n" + ".inst 0x448042f3 // smlalb z19.s, p4/M, z23.h, z0.h\n" + ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n" + ".inst 0x448046e9 // smlalt z9.s, p4/M, z23.h, z0.h\n" + ".inst 0x44804327 // smlalb z7.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804726 // smlalt z6.s, p4/M, z25.h, z0.h\n" + "ld1sb { z25.h }, p3/Z, [x21, x2]\n" + ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n" + ".inst 0x4480430c // smlalb z12.s, p4/M, z24.h, z0.h\n" + ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n" + ".inst 0x448142eb // smlalb z11.s, p4/M, z23.h, z1.h\n" + ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n" + ".inst 0x448143f3 // smlalb z19.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n" + ".inst 0x44814307 // smlalb z7.s, p4/M, z24.h, z1.h\n" + ".inst 0x44814706 // smlalt z6.s, p4/M, z24.h, z1.h\n" + "ld1sb { z24.h }, p3/Z, [x20, x2]\n" + ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n" + ".inst 0x4481436c // smlalb z12.s, p4/M, z27.h, z1.h\n" + ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n" + ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n" + 
".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, z2.h\n" + ".inst 0x448243d3 // smlalb z19.s, p4/M, z30.h, z2.h\n" + ".inst 0x448247c9 // smlalt z9.s, p4/M, z30.h, z2.h\n" + ".inst 0x44824367 // smlalb z7.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824766 // smlalt z6.s, p4/M, z27.h, z2.h\n" + "ld1sb { z27.h }, p3/Z, [x19, x2]\n" + "inch x2\n" + ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n" + "whilelt p2.s, x2, x0\n" + ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n" + "mov x19, x2\n" + ".inst 0x448343cb // smlalb z11.s, p4/M, z30.h, z3.h\n" + ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n" + ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n" + "incw x19\n" + ".inst 0x44834393 // smlalb z19.s, p4/M, z28.h, z3.h\n" + "whilelt p1.s, x19, x0\n" + ".inst 0x44834789 // smlalt z9.s, p4/M, z28.h, z3.h\n" + "whilelt p3.h, x2, x0\n" + ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n" + ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n" + ".inst 0x4484438b // smlalb z11.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n" + ".inst 0x04b5756b // sqrdmulh z11.s, z11.s, z21.s\n" + ".inst 0x04aa7610 // sqrdmulh z16.s, z16.s, z10.s\n" + ".inst 0x04b57673 // sqrdmulh z19.s, z19.s, z21.s\n" + ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n" + "and z31.d, z11.d, z29.d\n" + "asr z31.s, z31.s, #0x1f\n" + "and z23.d, z16.d, z20.d\n" + "and z25.d, z19.d, z29.d\n" + "asr z23.s, z23.s, #0x1f\n" + "and z18.d, z9.d, z20.d\n" + ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n" + "asr z25.s, z25.s, #0x1f\n" + ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n" + "asr z18.s, z18.s, #0x1f\n" + "sqadd z11.s, z11.s, z31.s\n" + ".inst 0x4484436c // smlalb z12.s, p4/M, z27.h, z4.h\n" + ".inst 0x04b574e7 // sqrdmulh z7.s, z7.s, z21.s\n" + "sqadd z16.s, z16.s, z23.s\n" + "sqadd z19.s, z19.s, z25.s\n" + ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n" + "sqadd z9.s, z9.s, z18.s\n" + "and z1.d, z7.d, z29.d\n" + "asr z1.s, z1.s, #0x1f\n" + "and z18.d, z6.d, z20.d\n" + ".inst 0x04b5758c // sqrdmulh z12.s, z12.s, z21.s\n" + "asr z18.s, z18.s, #0x1f\n" + ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n" + ".inst 0x448293ab // srshl z11.s, p4/M, z11.s, z29.s\n" + "and z30.d, z12.d, z29.d\n" + "asr z30.s, z30.s, #0x1f\n" + "add z11.s, z11.s, z14.s\n" + "sqadd z7.s, z7.s, z1.s\n" + "sqadd z6.s, z6.s, z18.s\n" + ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n" + "smin z11.s, p4/M, z11.s, z15.s\n" + ".inst 0x44829290 // srshl z16.s, p4/M, z16.s, z20.s\n" + "sqadd z12.s, z12.s, z30.s\n" + "and z3.d, z8.d, z20.d\n" + "asr z3.s, z3.s, #0x1f\n" + "add z16.s, z16.s, z14.s\n" + "smax z11.s, p4/M, z11.s, z5.s\n" + ".inst 0x448293b3 // srshl z19.s, p4/M, z19.s, z29.s\n" + ".inst 0x44829289 // srshl z9.s, p4/M, z9.s, z20.s\n" + "smin z16.s, p4/M, z16.s, z15.s\n" + ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n" + "add z19.s, z19.s, z14.s\n" + "add z9.s, z9.s, z14.s\n" + "sqadd z8.s, z8.s, z3.s\n" + "add z7.s, z7.s, z14.s\n" + "smax z16.s, p4/M, z16.s, z5.s\n" + "smin z19.s, p4/M, z19.s, z15.s\n" + "smin z9.s, p4/M, z9.s, z15.s\n" + "smin z7.s, p4/M, z7.s, z15.s\n" + "trn1 z11.h, z11.h, z16.h\n" + "st1b { z11.h }, p0, [x7, x3]\n" + "smax z19.s, p4/M, z19.s, z5.s\n" + "smax z9.s, p4/M, z9.s, z5.s\n" + "smax z7.s, p4/M, z7.s, z5.s\n" + ".inst 
0x44829286 // srshl z6.s, p4/M, z6.s, z20.s\n" + ".inst 0x448293ac // srshl z12.s, p4/M, z12.s, z29.s\n" + "trn1 z19.h, z19.h, z9.h\n" + "st1b { z19.h }, p0, [x8, x3]\n" + "add z6.s, z6.s, z14.s\n" + ".inst 0x44829288 // srshl z8.s, p4/M, z8.s, z20.s\n" + "add z12.s, z12.s, z14.s\n" + "smin z6.s, p4/M, z6.s, z15.s\n" + "add z8.s, z8.s, z14.s\n" + "smin z12.s, p4/M, z12.s, z15.s\n" + "smax z6.s, p4/M, z6.s, z5.s\n" + "smin z8.s, p4/M, z8.s, z15.s\n" + "smax z12.s, p4/M, z12.s, z5.s\n" + "trn1 z7.h, z7.h, z6.h\n" + "st1b { z7.h }, p0, [x17, x3]\n" + "smax z8.s, p4/M, z8.s, z5.s\n" + "trn1 z12.h, z12.h, z8.h\n" + "st1b { z12.h }, p0, [x16, x3]\n" + "inch x3\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z19.s }, p2/Z, [x19]\n" + "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z11.s, z19.s, z6.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z19.s, z6.s\n" + "mov z19.d, z11.d\n" + "ld1sb { z0.h }, p4/Z, [x1]\n" + ".inst 0x454d1000 // ssublb z0.h, z0.b, z13.b\n" + "mov z9.d, z16.d\n" + "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n" + "mov z7.d, z11.d\n" + "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1021 // ssublb z1.h, z1.b, z13.b\n" + "mov z6.d, z16.d\n" + "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n" + "mov z12.d, z11.d\n" + "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n" + "mov z8.d, z16.d\n" + "ldp x28, x27, [x5, #0x0]\n" + ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n" + "ldp x26, x25, [x5, #0x10]\n" + ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n" + "ldp x24, x23, [x5, #0x20]\n" + "ldp x22, x21, [x5, #0x30]\n" + "ldp x20, x19, [x5, #0x40]\n" + "ld1sb { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x455113ff // ssublb z31.h, z31.b, z17.b\n" + "ld1sb { z30.h }, p3/Z, [x27, x2]\n" + "ld1sb { z29.h }, p3/Z, [x26, x2]\n" + ".inst 0x455113de // ssublb z30.h, z30.b, z17.b\n" + "ld1sb { z28.h }, p3/Z, [x25, x2]\n" + "ld1sb { z27.h }, p3/Z, [x24, x2]\n" + ".inst 0x455113bd // ssublb z29.h, z29.b, z17.b\n" + "ld1sb { z23.h }, p3/Z, [x23, x2]\n" + ".inst 0x4551139c // ssublb z28.h, z28.b, z17.b\n" + "ld1sb { z25.h }, p3/Z, [x22, x2]\n" + "ld1sb { z24.h }, p3/Z, [x21, x2]\n" + ".inst 0x4551137b // ssublb z27.h, z27.b, z17.b\n" + "ld1sb { z26.h }, p3/Z, [x20, x2]\n" + ".inst 0x455112f7 // ssublb z23.h, z23.b, z17.b\n" + "ld1sb { z22.h }, p3/Z, [x19, x2]\n" + ".inst 0x45511339 // ssublb z25.h, z25.b, z17.b\n" + ".inst 0x45511318 // ssublb z24.h, z24.b, z17.b\n" + ".inst 0x4551135a // ssublb z26.h, z26.b, z17.b\n" + ".inst 0x455112d6 // ssublb z22.h, z22.b, z17.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" 
(&params) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp new file mode 100644 index 0000000000..7ab83e8659 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + +struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 4; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 9; + constexpr static unsigned int input_col_quads = 1; + + kern_type kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + + sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..f531912e72 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "arm_gemm.hpp" +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl( + const int8_t *const *const inptrs, + int8_t *const *const outptrs, + const void *params, + unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "mov z31.s, #0x0\n" + "ldr x24, [%x[inptrs], #0x0]\n" + "ptrue p2.b\n" + "mov z18.s, #0x0\n" + "ldr x23, [%x[inptrs], #0x8]\n" + "lsl x9, %x[n_channels], #0x2\n" + "mov z29.s, #0x0\n" + "ldr x22, [%x[inptrs], #0x10]\n" + "addvl SP, SP, #-8\n" + "mov z28.s, #0x0\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "mov x19, #0x9\n" + "mov z13.s, #0x0\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "whilelt p1.b, XZR, x19\n" + "mov z14.s, #0x0\n" + "ld1b { z7.b }, p1/Z, [x24]\n" + "mov x19, #0x3\n" + "mov z15.s, #0x0\n" + "ld1b { z3.b }, p1/Z, [x23]\n" + "whilelt p0.b, XZR, x19\n" + "mov z11.b, p0/z, #0x1\n" + "ld1b { z4.b }, p1/Z, [x22]\n" + "mov x28, #0x0\n" + "mov z10.d, z7.d\n" + "ld1b { z6.b }, p1/Z, [x21]\n" + "mov x27, #0x0\n" + "ext z10.b, z10.b, z10.b, #0x2\n" + "ld1b { z5.b }, p1/Z, [x20]\n" + "whilelt p1.b, x28, x9\n" + "mov z17.d, z7.d\n" + "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n" + "mov z26.d, z7.d\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "ext z26.b, z26.b, z26.b, #0x6\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "mov z19.d, z3.d\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "ext z19.b, z19.b, z19.b, #0x2\n" + "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n" + "zip1 z7.s, z7.s, z17.s\n" + "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n" + "zip1 z10.s, z10.s, z26.s\n" + "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n" + "zip1 z7.s, z7.s, z10.s\n" + "ld1w { z1.s }, p1/Z, [%x[params]]\n" + "mov z7.q, z7.q[0]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n" + "mov z17.d, z3.d\n" + "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n" + "addvl %x[params], %x[params], #4\n" + "mov z2.d, z3.d\n" + "mov z20.d, z4.d\n" + "ext z2.b, z2.b, z2.b, #0x6\n" + "zip1 z3.s, z3.s, z17.s\n" + "ext z20.b, z20.b, z20.b, #0x2\n" + "mov z17.d, z4.d\n" + "zip1 z19.s, z19.s, z2.s\n" + "zip1 z3.s, z3.s, z19.s\n" + "mov z3.q, z3.q[0]\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "mov z26.d, z4.d\n" + "ext z26.b, z26.b, z26.b, #0x6\n" + "mov z21.d, z6.d\n" + "zip1 z4.s, z4.s, z17.s\n" + "ext z21.b, z21.b, z21.b, #0x2\n" + "zip1 z20.s, z20.s, z26.s\n" + "zip1 z4.s, z4.s, z20.s\n" + "mov z4.q, z4.q[0]\n" + "mov z17.d, z6.d\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "mov z20.d, z6.d\n" + "ext z20.b, z20.b, z20.b, #0x6\n" + "mov z19.d, z5.d\n" + "zip1 z6.s, z6.s, z17.s\n" + "ext z19.b, z19.b, z19.b, #0x2\n" + "zip1 z21.s, z21.s, z20.s\n" + "zip1 z6.s, z6.s, z21.s\n" + "mov z6.q, z6.q[0]\n" + "mov z17.d, z5.d\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "mov z20.d, z5.d\n" + "ext z20.b, z20.b, z20.b, #0x6\n" + "mov z11.s, z11.s[0]\n" + "zip1 z5.s, z5.s, z17.s\n" + "mov z25.s, #0x0\n" + "zip1 z19.s, z19.s, z20.s\n" + "zip1 z5.s, z5.s, z19.s\n" + "mov z5.q, z5.q[0]\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z2.s, #0x0\n" + "mov z19.s, #0x0\n"
+ "sdot z31.s, z11.b, z7.b[0]\n" + "sdot z18.s, z11.b, z7.b[1]\n" + "sdot z29.s, z11.b, z7.b[2]\n" + "sdot z28.s, z11.b, z7.b[3]\n" + "sdot z13.s, z11.b, z3.b[0]\n" + "sdot z14.s, z11.b, z3.b[1]\n" + "sdot z15.s, z11.b, z3.b[2]\n" + "sdot z25.s, z11.b, z3.b[3]\n" + "sdot z26.s, z11.b, z4.b[0]\n" + "sdot z27.s, z11.b, z4.b[1]\n" + "sdot z24.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z4.b[3]\n" + "sdot z22.s, z11.b, z6.b[0]\n" + "sdot z21.s, z11.b, z6.b[1]\n" + "sdot z17.s, z11.b, z6.b[2]\n" + "sdot z20.s, z11.b, z6.b[3]\n" + "sdot z2.s, z11.b, z5.b[0]\n" + "sdot z19.s, z11.b, z5.b[1]\n" + "mov z31.d, z31.d\n" + "mov z18.d, z18.d\n" + "mov z29.d, z29.d\n" + "mov z28.d, z28.d\n" + "add z31.s, z31.s, z13.s\n" + "mov z13.s, #0x0\n" + "sdot z13.s, z11.b, z5.b[2]\n" + "add z18.s, z18.s, z14.s\n" + "mov z14.s, #0x0\n" + "sdot z14.s, z11.b, z5.b[3]\n" + "add z29.s, z29.s, z15.s\n" + "add z28.s, z28.s, z25.s\n" + "add z31.s, z31.s, z26.s\n" + "add z18.s, z18.s, z27.s\n" + "add z29.s, z29.s, z24.s\n" + "add z28.s, z28.s, z23.s\n" + "mov z26.d, z26.d\n" + "mov z25.d, z27.d\n" + "mov z24.d, z24.d\n" + "mov z23.d, z23.d\n" + "add z26.s, z26.s, z22.s\n" + "add z25.s, z25.s, z21.s\n" + "add z24.s, z24.s, z17.s\n" + "add z23.s, z23.s, z20.s\n" + "add z26.s, z26.s, z2.s\n" + "add z25.s, z25.s, z19.s\n" + "add z24.s, z24.s, z13.s\n" + "add z23.s, z23.s, z14.s\n" + "neg z30.s, p2/M, z30.s\n" + "mul z31.s, p2/M, z31.s, z30.s\n" + "st1w { z31.s }, p2, [SP]\n" + "add z31.s, z31.s, z1.s\n" + "mul z18.s, p2/M, z18.s, z30.s\n" + "st1w { z18.s }, p2, [SP, #1, MUL VL]\n" + "add z18.s, z18.s, z1.s\n" + "mul z29.s, p2/M, z29.s, z30.s\n" + "st1w { z29.s }, p2, [SP, #2, MUL VL]\n" + "add z29.s, z29.s, z1.s\n" + "mul z28.s, p2/M, z28.s, z30.s\n" + "st1w { z28.s }, p2, [SP, #3, MUL VL]\n" + "add z28.s, z28.s, z1.s\n" + "mul z26.s, p2/M, z26.s, z30.s\n" + "st1w { z26.s }, p2, [SP, #4, MUL VL]\n" + "add z26.s, z26.s, z1.s\n" + "mul z25.s, p2/M, z25.s, z30.s\n" + "st1w { z25.s }, p2, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z1.s\n" + "mul z24.s, p2/M, z24.s, z30.s\n" + "st1w { z24.s }, p2, [SP, #6, MUL VL]\n" + "add z24.s, z24.s, z1.s\n" + "mul z23.s, p2/M, z23.s, z30.s\n" + "st1w { z23.s }, p2, [SP, #7, MUL VL]\n" + "add z23.s, z23.s, z1.s\n" + "1:" // Loop + "sdot z31.s, z8.b, z7.b[0]\n" + "ld1w { z22.s }, p2/Z, [%x[params]]\n" + "incb x28\n" + "sdot z18.s, z8.b, z7.b[1]\n" + "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "whilelt p0.s, x27, %x[n_channels]\n" + "sdot z29.s, z8.b, z7.b[2]\n" + "whilelt p1.b, x28, x9\n" + "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n" + "sdot z28.s, z8.b, z7.b[3]\n" + "sdot z26.s, z8.b, z4.b[0]\n" + "sdot z25.s, z8.b, z4.b[1]\n" + "sdot z24.s, z8.b, z4.b[2]\n" + "sdot z23.s, z8.b, z4.b[3]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n" + "sdot z31.s, z9.b, z3.b[0]\n" + "sdot z18.s, z9.b, z3.b[1]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "sdot z28.s, z9.b, z3.b[3]\n" + "sdot z26.s, z9.b, z6.b[0]\n" + "sdot z25.s, z9.b, z6.b[1]\n" + "sdot z24.s, z9.b, z6.b[2]\n" + "sdot z23.s, z9.b, z6.b[3]\n" + "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n" + "sdot z31.s, z10.b, z4.b[0]\n" + "sdot z18.s, z10.b, z4.b[1]\n" + "sdot z29.s, z10.b, z4.b[2]\n" + "sdot z28.s, z10.b, z4.b[3]\n" + "sdot z26.s, z10.b, z5.b[0]\n" + "sdot z25.s, z10.b, z5.b[1]\n" + "sdot z24.s, z10.b, z5.b[2]\n" + "sdot z23.s, z10.b, z5.b[3]\n" + "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n" + "addvl %x[params], %x[params], #6\n" + ".inst 0x04b677ff // sqrdmulh z31.s, z31.s, z22.s\n" + ".inst 0x04b67652 // sqrdmulh 
z18.s, z18.s, z22.s\n" + ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n" + ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n" + ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n" + "and z20.d, z31.d, z21.d\n" + "asr z20.s, z20.s, #0x1f\n" + "and z19.d, z18.d, z21.d\n" + "and z14.d, z29.d, z21.d\n" + "asr z19.s, z19.s, #0x1f\n" + "and z17.d, z28.d, z21.d\n" + "and z2.d, z26.d, z21.d\n" + "asr z14.s, z14.s, #0x1f\n" + ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z31.s, z31.s, z20.s\n" + ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n" + "asr z2.s, z2.s, #0x1f\n" + ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n" + "sqadd z18.s, z18.s, z19.s\n" + "sqadd z29.s, z29.s, z14.s\n" + "and z27.d, z25.d, z21.d\n" + "asr z27.s, z27.s, #0x1f\n" + "sqadd z28.s, z28.s, z17.s\n" + "sqadd z26.s, z26.s, z2.s\n" + "and z17.d, z24.d, z21.d\n" + "asr z17.s, z17.s, #0x1f\n" + "and z15.d, z23.d, z21.d\n" + ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n" + "asr z15.s, z15.s, #0x1f\n" + "sqadd z25.s, z25.s, z27.s\n" + ".inst 0x44828ab2 // srshl z18.s, p2/M, z18.s, z21.s\n" + "add z31.s, z31.s, z12.s\n" + "sqadd z24.s, z24.s, z17.s\n" + ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n" + "add z18.s, z18.s, z12.s\n" + "sqadd z23.s, z23.s, z15.s\n" + "smin z31.s, p2/M, z31.s, z0.s\n" + "add z29.s, z29.s, z12.s\n" + "smin z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n" + "smax z31.s, p2/M, z31.s, z16.s\n" + "st1b { z31.s }, p0, [x26, x27]\n" + "add z28.s, z28.s, z12.s\n" + "smax z18.s, p2/M, z18.s, z16.s\n" + "ld1w { z31.s }, p2/Z, [SP]\n" + "smin z29.s, p2/M, z29.s, z0.s\n" + "st1b { z18.s }, p0, [x25, x27]\n" + "add z31.s, z31.s, z1.s\n" + "smin z28.s, p2/M, z28.s, z0.s\n" + "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n" + "smax z29.s, p2/M, z29.s, z16.s\n" + "st1b { z29.s }, p0, [x24, x27]\n" + "add z18.s, z18.s, z1.s\n" + "smax z28.s, p2/M, z28.s, z16.s\n" + "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n" + ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n" + "st1b { z28.s }, p0, [x23, x27]\n" + "add z29.s, z29.s, z1.s\n" + ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n" + "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n" + "add z26.s, z26.s, z12.s\n" + ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n" + ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n" + "add z25.s, z25.s, z12.s\n" + "add z28.s, z28.s, z1.s\n" + "add z24.s, z24.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "smin z26.s, p2/M, z26.s, z0.s\n" + "smin z25.s, p2/M, z25.s, z0.s\n" + "smin z24.s, p2/M, z24.s, z0.s\n" + "smin z23.s, p2/M, z23.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z16.s\n" + "st1b { z26.s }, p0, [x22, x27]\n" + "smax z25.s, p2/M, z25.s, z16.s\n" + "smax z24.s, p2/M, z24.s, z16.s\n" + "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n" + "smax z23.s, p2/M, z23.s, z16.s\n" + "st1b { z25.s }, p0, [x21, x27]\n" + "add z26.s, z26.s, z1.s\n" + "st1b { z24.s }, p0, [x20, x27]\n" + "st1b { z23.s }, p0, [x19, x27]\n" + "incw x27\n" + "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z1.s\n" + "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n" + "add z24.s, z24.s, z1.s\n" + "add z23.s, z23.s, z1.s\n" + "b.any 1b\n" + "addvl SP, SP, #8\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, 
c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..2c33bdcd3a --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + +struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 4; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 8; + constexpr static unsigned int input_cols = 6; + constexpr static unsigned int input_col_quads = 1; + + kern_type kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + + sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..ffa2c6a7bc --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + + +#include "arm_gemm.hpp" +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl( + const int8_t *const *const inptrs, + int8_t *const *const outptrs, + const void *params, + const unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "mov z20.b, #0x1\n" + "ldr x24, [%x[inptrs], #0x0]\n" + "ptrue p2.b\n" + "mov z22.s, #0x1\n" + "ldr x23, [%x[inptrs], #0x8]\n" + "lsl x9, %x[n_channels], #0x2\n" + "mov z30.s, #0x0\n" + "ldr x22, [%x[inptrs], #0x10]\n" + "addvl SP, SP, #-8\n" + "mov z28.s, #0x0\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "mov x20, #0x6\n" + "mov z29.s, #0x0\n" + "ldr x19, [%x[inptrs], #0x20]\n" + "whilelt p0.b, XZR, x20\n" + "mov z27.s, #0x0\n" + "ld1b { z0.b }, p0/Z, [x24]\n" + "mov x28, #0x0\n" + "mov z26.s, #0x0\n" + "ld1b { z3.b }, p0/Z, [x23]\n" + "mov x27, #0x0\n" + "mov z25.s, #0x0\n" + "ld1b { z5.b }, p0/Z, [x22]\n" + "whilelt p1.b, x28, x9\n" + "mov z15.d, z0.d\n" + "ld1b { z4.b }, p0/Z, [x21]\n" + "mov z24.s, #0x0\n" + "ld1b { z6.b }, p0/Z, [x19]\n" + "ext z15.b, z15.b, z15.b, #0x1\n" + "ldr x21, [%x[inptrs], #0x28]\n" + "mov z16.d, z3.d\n" + "ldr x20, [%x[inptrs], #0x30]\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "ldr x19, [%x[inptrs], #0x38]\n" + "mov z18.d, z5.d\n" + "ld1b { z7.b }, p0/Z, [x21]\n" + "zip1 z0.d, z0.d, z15.d\n" + "ld1b { z1.b }, p0/Z, [x20]\n" + "mov z0.q, z0.q[0]\n" + "ld1b { z2.b }, p0/Z, [x19]\n" + "zip1 z3.d, z3.d, z16.d\n" + "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n" + "mov z3.q, z3.q[0]\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "ext z18.b, z18.b, z18.b, #0x1\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "mov z16.d, z4.d\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "mov z17.d, z6.d\n" + "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n" + "zip1 z5.d, z5.d, z18.d\n" + "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n" + "mov z5.q, z5.q[0]\n" + "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n" + "zip1 z4.d, z4.d, z16.d\n" + "ld1w { z13.s }, p1/Z, [%x[params]]\n" + "mov z4.q, z4.q[0]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n" + "ext z17.b, z17.b, z17.b, #0x1\n" + "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n" + "mov z16.d, z7.d\n" + "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n" + "addvl %x[params], %x[params], #5\n" + "zip1 z6.d, z6.d, z17.d\n" + "mov z17.d, z1.d\n" + "mov z6.q, z6.q[0]\n" + "zip1 z7.d, z7.d, z16.d\n" + "mov z7.q, z7.q[0]\n" + "ext z17.b, z17.b, z17.b, #0x1\n" + "mov z16.d, z2.d\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "mov z23.s, #0x0\n" + "zip1 z1.d, z1.d, z17.d\n" + "mov z1.q, z1.q[0]\n" + "zip1 z2.d, z2.d, z16.d\n" + "mov z2.q, z2.q[0]\n" + "mov z18.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z19.s, #0x0\n" + "sdot z30.s, z20.b, z0.b[0]\n" + "sdot z28.s, z20.b, z0.b[2]\n" + "sdot z29.s, z20.b, z3.b[0]\n" + "sdot z27.s, z20.b, z3.b[2]\n" + "sdot z30.s, z22.b, z0.b[1]\n" + "sdot z28.s, z22.b, z0.b[3]\n" + "sdot z29.s, z22.b, z3.b[1]\n" + "sdot z27.s, z22.b, z3.b[3]\n" + "sdot z26.s, z20.b, z5.b[0]\n" + "sdot z25.s, z20.b, z5.b[2]\n" + "sdot z24.s, z20.b, z4.b[0]\n" + "sdot z23.s, z20.b, z4.b[2]\n" + "sdot z26.s, z22.b, z5.b[1]\n" + "sdot z25.s,
z22.b, z5.b[3]\n" + "sdot z24.s, z22.b, z4.b[1]\n" + "sdot z23.s, z22.b, z4.b[3]\n" + "sdot z18.s, z20.b, z6.b[0]\n" + "sdot z17.s, z20.b, z6.b[2]\n" + "sdot z16.s, z20.b, z7.b[0]\n" + "sdot z21.s, z20.b, z7.b[2]\n" + "sdot z18.s, z22.b, z6.b[1]\n" + "sdot z17.s, z22.b, z6.b[3]\n" + "sdot z16.s, z22.b, z7.b[1]\n" + "sdot z21.s, z22.b, z7.b[3]\n" + "sdot z19.s, z20.b, z1.b[0]\n" + "mov z30.d, z30.d\n" + "mov z28.d, z28.d\n" + "add z30.s, z30.s, z29.s\n" + "sdot z19.s, z22.b, z1.b[1]\n" + "add z28.s, z28.s, z27.s\n" + "add z30.s, z30.s, z26.s\n" + "mov z29.d, z29.d\n" + "add z28.s, z28.s, z25.s\n" + "add z30.s, z30.s, z24.s\n" + "mov z27.d, z27.d\n" + "add z28.s, z28.s, z23.s\n" + "add z30.s, z30.s, z18.s\n" + "add z29.s, z29.s, z26.s\n" + "add z28.s, z28.s, z17.s\n" + "add z27.s, z27.s, z25.s\n" + "add z29.s, z29.s, z24.s\n" + "mov z26.d, z26.d\n" + "add z27.s, z27.s, z23.s\n" + "add z29.s, z29.s, z18.s\n" + "mov z25.d, z25.d\n" + "add z27.s, z27.s, z17.s\n" + "add z29.s, z29.s, z16.s\n" + "add z26.s, z26.s, z24.s\n" + "add z27.s, z27.s, z21.s\n" + "add z25.s, z25.s, z23.s\n" + "add z26.s, z26.s, z18.s\n" + "mov z24.d, z24.d\n" + "add z25.s, z25.s, z17.s\n" + "add z26.s, z26.s, z16.s\n" + "mov z23.d, z23.d\n" + "add z25.s, z25.s, z21.s\n" + "add z26.s, z26.s, z19.s\n" + "add z24.s, z24.s, z18.s\n" + "mov z18.s, #0x0\n" + "sdot z18.s, z20.b, z1.b[2]\n" + "add z23.s, z23.s, z17.s\n" + "mov z17.s, #0x0\n" + "sdot z17.s, z20.b, z2.b[0]\n" + "sdot z18.s, z22.b, z1.b[3]\n" + "add z24.s, z24.s, z16.s\n" + "mov z16.s, #0x0\n" + "sdot z17.s, z22.b, z2.b[1]\n" + "sdot z16.s, z20.b, z2.b[2]\n" + "add z25.s, z25.s, z18.s\n" + "add z23.s, z23.s, z21.s\n" + "add z24.s, z24.s, z19.s\n" + "sdot z16.s, z22.b, z2.b[3]\n" + "add z23.s, z23.s, z18.s\n" + "add z24.s, z24.s, z17.s\n" + "neg z15.s, p2/M, z15.s\n" + "add z23.s, z23.s, z16.s\n" + "mul z30.s, p2/M, z30.s, z15.s\n" + "st1w { z30.s }, p2, [SP]\n" + "add z30.s, z30.s, z13.s\n" + "mul z28.s, p2/M, z28.s, z15.s\n" + "st1w { z28.s }, p2, [SP, #1, MUL VL]\n" + "add z28.s, z28.s, z13.s\n" + "mul z29.s, p2/M, z29.s, z15.s\n" + "st1w { z29.s }, p2, [SP, #2, MUL VL]\n" + "add z29.s, z29.s, z13.s\n" + "mul z27.s, p2/M, z27.s, z15.s\n" + "st1w { z27.s }, p2, [SP, #3, MUL VL]\n" + "add z27.s, z27.s, z13.s\n" + "mul z26.s, p2/M, z26.s, z15.s\n" + "st1w { z26.s }, p2, [SP, #4, MUL VL]\n" + "add z26.s, z26.s, z13.s\n" + "mul z25.s, p2/M, z25.s, z15.s\n" + "st1w { z25.s }, p2, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z13.s\n" + "mul z24.s, p2/M, z24.s, z15.s\n" + "st1w { z24.s }, p2, [SP, #6, MUL VL]\n" + "add z24.s, z24.s, z13.s\n" + "mul z23.s, p2/M, z23.s, z15.s\n" + "st1w { z23.s }, p2, [SP, #7, MUL VL]\n" + "add z23.s, z23.s, z13.s\n" + "1:" // Loop + "sdot z30.s, z8.b, z0.b[0]\n" + "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "incb x28\n" + "sdot z28.s, z8.b, z0.b[2]\n" + "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "whilelt p0.s, x27, %x[n_channels]\n" + "sdot z29.s, z8.b, z3.b[0]\n" + "whilelt p1.b, x28, x9\n" + "sdot z27.s, z8.b, z3.b[2]\n" + "sdot z26.s, z8.b, z5.b[0]\n" + "sdot z25.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z4.b[0]\n" + "sdot z23.s, z8.b, z4.b[2]\n" + "ld1b { z8.b }, p2/Z, [%x[params]]\n" + "sdot z30.s, z9.b, z0.b[1]\n" + "sdot z28.s, z9.b, z0.b[3]\n" + "sdot z29.s, z9.b, z3.b[1]\n" + "sdot z27.s, z9.b, z3.b[3]\n" + "sdot z26.s, z9.b, z5.b[1]\n" + "sdot z25.s, z9.b, z5.b[3]\n" + "sdot z24.s, z9.b, z4.b[1]\n" + "sdot z23.s, z9.b, z4.b[3]\n" + "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + 
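// Editor's note, not part of the upstream patch: each indexed SDOT in this
+ // loop body accumulates a four-way int8 dot product into every 32-bit lane,
+ // roughly acc[i] += w[4i]*x[4i] + w[4i+1]*x[4i+1] + w[4i+2]*x[4i+2] + w[4i+3]*x[4i+3],
+ // where z8-z11 hold packed weights loaded from [%x[params]] and z0.b[n]
+ // broadcasts one quadword of the interleaved inputs built by the zip1/ext
+ // preamble. In that preamble, z20 (all-one bytes) and z22 (one set byte per
+ // 32-bit lane) let SDOT sum five consecutive inputs per output pixel; those
+ // sums, scaled by -b_offset (the weight zero point, z15), pre-correct the
+ // bias so the main loop can run plain SDOTs on the raw int8 data.
+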
"sdot z28.s, z10.b, z3.b[2]\n" + "sdot z29.s, z10.b, z5.b[0]\n" + "sdot z27.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z4.b[0]\n" + "sdot z25.s, z10.b, z4.b[2]\n" + "sdot z24.s, z10.b, z6.b[0]\n" + "sdot z23.s, z10.b, z6.b[2]\n" + "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n" + "sdot z30.s, z11.b, z3.b[1]\n" + "sdot z28.s, z11.b, z3.b[3]\n" + "sdot z29.s, z11.b, z5.b[1]\n" + "sdot z27.s, z11.b, z5.b[3]\n" + "sdot z26.s, z11.b, z4.b[1]\n" + "sdot z25.s, z11.b, z4.b[3]\n" + "sdot z24.s, z11.b, z6.b[1]\n" + "sdot z23.s, z11.b, z6.b[3]\n" + "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "sdot z30.s, z8.b, z5.b[0]\n" + "sdot z28.s, z8.b, z5.b[2]\n" + "sdot z29.s, z8.b, z4.b[0]\n" + "sdot z27.s, z8.b, z4.b[2]\n" + "sdot z26.s, z8.b, z6.b[0]\n" + "sdot z25.s, z8.b, z6.b[2]\n" + "sdot z24.s, z8.b, z7.b[0]\n" + "sdot z23.s, z8.b, z7.b[2]\n" + "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n" + "sdot z30.s, z9.b, z5.b[1]\n" + "sdot z28.s, z9.b, z5.b[3]\n" + "sdot z29.s, z9.b, z4.b[1]\n" + "sdot z27.s, z9.b, z4.b[3]\n" + "sdot z26.s, z9.b, z6.b[1]\n" + "sdot z25.s, z9.b, z6.b[3]\n" + "sdot z24.s, z9.b, z7.b[1]\n" + "sdot z23.s, z9.b, z7.b[3]\n" + "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "sdot z30.s, z10.b, z4.b[0]\n" + "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n" + "sdot z28.s, z10.b, z4.b[2]\n" + "sdot z29.s, z10.b, z6.b[0]\n" + "sdot z27.s, z10.b, z6.b[2]\n" + "sdot z26.s, z10.b, z7.b[0]\n" + "sdot z25.s, z10.b, z7.b[2]\n" + "sdot z24.s, z10.b, z1.b[0]\n" + "sdot z23.s, z10.b, z1.b[2]\n" + "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n" + "sdot z30.s, z11.b, z4.b[1]\n" + "sdot z28.s, z11.b, z4.b[3]\n" + "sdot z29.s, z11.b, z6.b[1]\n" + "sdot z27.s, z11.b, z6.b[3]\n" + "sdot z26.s, z11.b, z7.b[1]\n" + "sdot z25.s, z11.b, z7.b[3]\n" + "sdot z24.s, z11.b, z1.b[1]\n" + "sdot z23.s, z11.b, z1.b[3]\n" + "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n" + "sdot z30.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z6.b[2]\n" + "sdot z29.s, z8.b, z7.b[0]\n" + "sdot z27.s, z8.b, z7.b[2]\n" + "sdot z26.s, z8.b, z1.b[0]\n" + "sdot z25.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z23.s, z8.b, z2.b[2]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n" + "sdot z30.s, z9.b, z6.b[1]\n" + "sdot z28.s, z9.b, z6.b[3]\n" + "sdot z29.s, z9.b, z7.b[1]\n" + "sdot z27.s, z9.b, z7.b[3]\n" + "sdot z26.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[1]\n" + "sdot z23.s, z9.b, z2.b[3]\n" + "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n" + "addvl %x[params], %x[params], #-3\n" + ".inst 0x04b677de // sqrdmulh z30.s, z30.s, z22.s\n" + ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n" + ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n" + ".inst 0x04b6777b // sqrdmulh z27.s, z27.s, z22.s\n" + ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n" + "and z20.d, z30.d, z21.d\n" + "asr z20.s, z20.s, #0x1f\n" + "and z19.d, z28.d, z21.d\n" + "and z18.d, z29.d, z21.d\n" + "asr z19.s, z19.s, #0x1f\n" + "and z17.d, z27.d, z21.d\n" + "and z16.d, z26.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z30.s, z30.s, z20.s\n" + ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n" + "sqadd z28.s, z28.s, z19.s\n" + "sqadd z29.s, z29.s, z18.s\n" + "and z18.d, z25.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + "sqadd z27.s, z27.s, z17.s\n" + "sqadd z26.s, z26.s, 
z16.s\n" + "and z17.d, z24.d, z21.d\n" + "asr z17.s, z17.s, #0x1f\n" + "and z16.d, z23.d, z21.d\n" + ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z25.s, z25.s, z18.s\n" + ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n" + "add z30.s, z30.s, z14.s\n" + "sqadd z24.s, z24.s, z17.s\n" + ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n" + "add z28.s, z28.s, z14.s\n" + "sqadd z23.s, z23.s, z16.s\n" + "smin z30.s, p2/M, z30.s, z12.s\n" + "add z29.s, z29.s, z14.s\n" + "smin z28.s, p2/M, z28.s, z12.s\n" + ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n" + "smax z30.s, p2/M, z30.s, z31.s\n" + "st1b { z30.s }, p0, [x26, x27]\n" + "add z27.s, z27.s, z14.s\n" + "smax z28.s, p2/M, z28.s, z31.s\n" + "ld1w { z30.s }, p2/Z, [SP]\n" + "smin z29.s, p2/M, z29.s, z12.s\n" + "st1b { z28.s }, p0, [x25, x27]\n" + "add z30.s, z30.s, z13.s\n" + "smin z27.s, p2/M, z27.s, z12.s\n" + "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n" + "smax z29.s, p2/M, z29.s, z31.s\n" + "st1b { z29.s }, p0, [x24, x27]\n" + "add z28.s, z28.s, z13.s\n" + "smax z27.s, p2/M, z27.s, z31.s\n" + "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n" + ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n" + "st1b { z27.s }, p0, [x23, x27]\n" + "add z29.s, z29.s, z13.s\n" + ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n" + "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n" + "add z26.s, z26.s, z14.s\n" + ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n" + ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n" + "add z25.s, z25.s, z14.s\n" + "add z27.s, z27.s, z13.s\n" + "add z24.s, z24.s, z14.s\n" + "add z23.s, z23.s, z14.s\n" + "smin z26.s, p2/M, z26.s, z12.s\n" + "smin z25.s, p2/M, z25.s, z12.s\n" + "smin z24.s, p2/M, z24.s, z12.s\n" + "smin z23.s, p2/M, z23.s, z12.s\n" + "smax z26.s, p2/M, z26.s, z31.s\n" + "st1b { z26.s }, p0, [x22, x27]\n" + "smax z25.s, p2/M, z25.s, z31.s\n" + "smax z24.s, p2/M, z24.s, z31.s\n" + "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n" + "smax z23.s, p2/M, z23.s, z31.s\n" + "st1b { z25.s }, p0, [x21, x27]\n" + "add z26.s, z26.s, z13.s\n" + "st1b { z24.s }, p0, [x20, x27]\n" + "st1b { z23.s }, p0, [x19, x27]\n" + "incw x27\n" + "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z13.s\n" + "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n" + "add z24.s, z24.s, z13.s\n" + "add z23.s, z23.s, z13.s\n" + "b.any 1b\n" + "addvl SP, SP, #8\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp 
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..4098f6f660 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + +struct sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst +{ + typedef int32_t bias_type; + typedef int8_t input_type; + typedef int8_t weight_type; + typedef int8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size; + + kern_type kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + + sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp new file mode 100644 index
0000000000..3345449fe1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(__ARM_FEATURE_SVE) + +#include "arm_gemm.hpp" +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +{ + __asm__ __volatile__( + "ldp x11, x10, [%x[inptrs], #0x0]\n" + "ptrue p2.b\n" + "ldp x9, x28, [%x[inptrs], #0x10]\n" + "addvl SP, SP, #-8\n" + "ldp x27, x26, [%x[inptrs], #0x20]\n" + "mov x25, #0x0\n" + "ldp x24, x23, [%x[inptrs], #0x30]\n" + "whilelt p1.b, x25, %x[n_channels]\n" + "ldp x22, x21, [%x[outptrs], #0x0]\n" + "ldp x20, x19, [%x[outptrs], #0x10]\n" + "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n" + "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n" + "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n" + "1:" // Loop + "ld1b { z19.b }, p1/Z, [x11, x25]\n" + "whilelt p0.s, x25, %x[n_channels]\n" + "ld1b { z18.b }, p1/Z, [x10, x25]\n" + "ldp x11, x10, [%x[inptrs], #0x40]\n" + "ld1b { z16.b }, p1/Z, [x9, x25]\n" + "zip1 z21.b, z19.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x28, x25]\n" + "zip2 z19.b, z19.b, z16.b\n" + "ldp x9, x28, [%x[inptrs], #0x50]\n" + "ld1b { z23.b }, p1/Z, [x27, x25]\n" + "zip1 z16.b, z18.b, z17.b\n" + "ld1b { z20.b }, p1/Z, [x26, x25]\n" + "zip2 z18.b, z18.b, z17.b\n" + "ldp x27, x26, [%x[inptrs], #0x60]\n" + "zip1 z3.b, z21.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x24, x25]\n" + "zip2 z2.b, z21.b, z16.b\n" + "ld1b { z16.b }, p1/Z, [x23, x25]\n" + "zip1 z29.b, z19.b, z18.b\n" + "ldp x24, x23, [%x[inptrs], #0x70]\n" + "zip2 z28.b, z19.b, z18.b\n" + "ld1b { z22.b }, p1/Z, [x11, x25]\n" + "zip1 z19.b, z23.b, z17.b\n" + "ld1b { z21.b }, p1/Z, [x10, x25]\n" + "zip2 z27.b, z23.b, z17.b\n" + "ldp x11, x10, [%x[inptrs], #0x0]\n" + "zip1 z18.b, z20.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x9, x25]\n" + "zip2 z20.b, z20.b, z16.b\n" + "ld1b { z16.b }, p1/Z, [x28, x25]\n" + "zip1 z1.b, z19.b, z18.b\n" + "ldp x9, x28, [%x[inptrs], #0x10]\n" + "zip2 z0.b, z19.b, z18.b\n" + "ld1b { z19.b }, p1/Z, [x27, x25]\n" + "zip1 z26.b, z22.b, z17.b\n" + "ld1b { z25.b }, p1/Z, [x26, x25]\n" + "zip2
z24.b, z22.b, z17.b\n" + "ldp x27, x26, [%x[inptrs], #0x20]\n" + "zip1 z23.b, z21.b, z16.b\n" + "ld1b { z18.b }, p1/Z, [x24, x25]\n" + "zip2 z22.b, z21.b, z16.b\n" + "ld1b { z21.b }, p1/Z, [x23, x25]\n" + "zip1 z17.b, z27.b, z20.b\n" + "ldp x24, x23, [%x[inptrs], #0x30]\n" + "zip2 z16.b, z27.b, z20.b\n" + "st1b { z29.b }, p2, [SP]\n" + "zip1 z20.b, z19.b, z18.b\n" + "st1b { z28.b }, p2, [SP, #1, MUL VL]\n" + "zip2 z19.b, z19.b, z18.b\n" + "st1b { z17.b }, p2, [SP, #2, MUL VL]\n" + "zip1 z18.b, z25.b, z21.b\n" + "st1b { z16.b }, p2, [SP, #3, MUL VL]\n" + "zip2 z17.b, z25.b, z21.b\n" + "ld1w { z31.s }, p2/Z, [%x[params]]\n" + "zip1 z30.b, z26.b, z23.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n" + "zip2 z28.b, z26.b, z23.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #2, MUL VL]\n" + "zip1 z16.b, z24.b, z22.b\n" + "st1b { z16.b }, p2, [SP, #4, MUL VL]\n" + "zip2 z16.b, z24.b, z22.b\n" + "st1b { z16.b }, p2, [SP, #5, MUL VL]\n" + "zip1 z26.b, z20.b, z18.b\n" + "ld1b { z25.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "zip2 z24.b, z20.b, z18.b\n" + "ld1w { z23.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "zip1 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p2, [SP, #6, MUL VL]\n" + "zip2 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p2, [SP, #7, MUL VL]\n" + "mov z22.d, z31.d\n" + "ld1w { z21.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "mov z20.d, z31.d\n" + "mov z19.d, z31.d\n" + "sdot z31.s, z29.b, z3.b\n" + "sdot z20.s, z29.b, z1.b\n" + "ext z3.b, z3.b, z3.b, #0x1\n" + "sdot z31.s, z27.b, z1.b\n" + "ext z1.b, z1.b, z1.b, #0x1\n" + "sdot z20.s, z27.b, z30.b\n" + "sdot z22.s, z29.b, z3.b\n" + "ld1b { z3.b }, p2/Z, [SP]\n" + "sdot z31.s, z25.b, z30.b\n" + "ext z30.b, z30.b, z30.b, #0x1\n" + "sdot z20.s, z25.b, z26.b\n" + "ext z26.b, z26.b, z26.b, #0x1\n" + "sdot z19.s, z29.b, z1.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #7, MUL VL]\n" + "sdot z22.s, z27.b, z1.b\n" + "ld1b { z1.b }, p2/Z, [SP, #2, MUL VL]\n" + ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" + ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" + "sdot z19.s, z27.b, z30.b\n" + "sdot z22.s, z25.b, z30.b\n" + "ld1b { z30.b }, p2/Z, [SP, #4, MUL VL]\n" + "and z16.d, z31.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sdot z19.s, z25.b, z26.b\n" + "ld1b { z26.b }, p2/Z, [SP, #6, MUL VL]\n" + ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" + "and z18.d, z20.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" + "sqadd z31.s, z31.s, z16.s\n" + "and z17.d, z22.d, z21.d\n" + "asr z17.s, z17.s, #0x1f\n" + "and z16.d, z19.d, z21.d\n" + "sqadd z20.s, z20.s, z18.s\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n" + "sqadd z22.s, z22.s, z17.s\n" + ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n" + "add z31.s, z31.s, z4.s\n" + "sqadd z19.s, z19.s, z16.s\n" + "add z20.s, z20.s, z4.s\n" + ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n" + "smax z31.s, p2/M, z31.s, z6.s\n" + "smax z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z5.s\n" + "st1b { z31.s }, p0, [x22, x25]\n" + "add z19.s, z19.s, z4.s\n" + "smax z22.s, p2/M, z22.s, z6.s\n" + "ld1w { z31.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "smin z20.s, p2/M, z20.s, z5.s\n" + "ld1b { z27.b }, p2/Z, [%x[params], #-8, MUL VL]\n" + "ld1b { z25.b }, p2/Z, [%x[params], #-7, MUL VL]\n" + "smax z19.s, p2/M, z19.s, z6.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #-6, MUL VL]\n" + 
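// Editor's note, not part of the upstream patch: the surrounding block is
+ // gemmlowp-style fixed-point requantization. Per 32-bit lane, roughly:
+ //   t  = sqrdmulh(acc, mul);    // doubling high-half multiply, mul in z23
+ //   t += asr(t & shift, 31);    // nudge so the rounding shift treats
+ //                               // negative values consistently, shift in z21
+ //   t  = srshl(t, shift);       // rounding arithmetic right shift
+ //   t += c_offset;              // re-centre on the output zero point (z4)
+ //   out = clamp(t, minval, maxval);  // z6/z5, then narrowed by st1b
+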
"smin z22.s, p2/M, z22.s, z5.s\n" + "ld1w { z21.s }, p2/Z, [%x[params], #-5, MUL VL]\n" + "smin z19.s, p2/M, z19.s, z5.s\n" + "st1b { z20.s }, p0, [x20, x25]\n" + "mov z20.d, z31.d\n" + "st1b { z22.s }, p0, [x21, x25]\n" + "mov z22.d, z31.d\n" + "st1b { z19.s }, p0, [x19, x25]\n" + "mov z19.d, z31.d\n" + "incw x25\n" + "sdot z31.s, z29.b, z2.b\n" + "whilelt p0.s, x25, %x[n_channels]\n" + "sdot z20.s, z29.b, z0.b\n" + "ext z2.b, z2.b, z2.b, #0x1\n" + "sdot z31.s, z27.b, z0.b\n" + "sdot z20.s, z27.b, z28.b\n" + "ext z0.b, z0.b, z0.b, #0x1\n" + "sdot z22.s, z29.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [SP, #1, MUL VL]\n" + "sdot z31.s, z25.b, z28.b\n" + "sdot z20.s, z25.b, z24.b\n" + "ext z28.b, z28.b, z28.b, #0x1\n" + "ext z24.b, z24.b, z24.b, #0x1\n" + "sdot z19.s, z29.b, z0.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #-3, MUL VL]\n" + "sdot z22.s, z27.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [SP, #3, MUL VL]\n" + ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" + ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" + "sdot z19.s, z27.b, z28.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #-2, MUL VL]\n" + "sdot z22.s, z25.b, z28.b\n" + "ld1b { z28.b }, p2/Z, [SP, #5, MUL VL]\n" + "and z16.d, z31.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sdot z19.s, z25.b, z24.b\n" + "ld1b { z25.b }, p2/Z, [%x[params], #-1, MUL VL]\n" + ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" + "ld1b { z24.b }, p2/Z, [SP, #7, MUL VL]\n" + "and z18.d, z20.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params]]\n" + "sqadd z31.s, z31.s, z16.s\n" + "and z17.d, z22.d, z21.d\n" + "asr z17.s, z17.s, #0x1f\n" + "and z16.d, z19.d, z21.d\n" + "sqadd z20.s, z20.s, z18.s\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n" + "sqadd z22.s, z22.s, z17.s\n" + ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n" + "add z31.s, z31.s, z4.s\n" + "sqadd z19.s, z19.s, z16.s\n" + "add z20.s, z20.s, z4.s\n" + ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n" + "smax z31.s, p2/M, z31.s, z6.s\n" + "smax z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n" + "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "add z22.s, z22.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z5.s\n" + "st1b { z31.s }, p0, [x22, x25]\n" + "add z19.s, z19.s, z4.s\n" + "smax z22.s, p2/M, z22.s, z6.s\n" + "ld1w { z31.s }, p2/Z, [%x[params], #-4, MUL VL]\n" + "smin z20.s, p2/M, z20.s, z5.s\n" + "st1b { z20.s }, p0, [x20, x25]\n" + "mov z20.d, z31.d\n" + "smin z22.s, p2/M, z22.s, z5.s\n" + "st1b { z22.s }, p0, [x21, x25]\n" + "mov z22.d, z31.d\n" + "sdot z20.s, z29.b, z1.b\n" + "smax z19.s, p2/M, z19.s, z6.s\n" + "sdot z20.s, z27.b, z30.b\n" + "smin z19.s, p2/M, z19.s, z5.s\n" + "st1b { z19.s }, p0, [x19, x25]\n" + "mov z19.d, z31.d\n" + "incw x25\n" + "sdot z31.s, z29.b, z3.b\n" + "whilelt p0.s, x25, %x[n_channels]\n" + "sdot z20.s, z25.b, z26.b\n" + "ext z3.b, z3.b, z3.b, #0x1\n" + "ext z26.b, z26.b, z26.b, #0x1\n" + "sdot z31.s, z27.b, z1.b\n" + "ext z1.b, z1.b, z1.b, #0x1\n" + "sdot z22.s, z29.b, z3.b\n" + ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" + "sdot z31.s, z25.b, z30.b\n" + "ext z30.b, z30.b, z30.b, #0x1\n" + "sdot z19.s, z29.b, z1.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "sdot z22.s, z27.b, z1.b\n" + "and z18.d, z20.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + "sdot z19.s, z27.b, z30.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #4, MUL VL]\n" + "sdot z22.s, z25.b, z30.b\n" + ".inst 
0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" + "sdot z19.s, z25.b, z26.b\n" + "ld1b { z25.b }, p2/Z, [%x[params], #5, MUL VL]\n" + "and z16.d, z31.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" + "sqadd z20.s, z20.s, z18.s\n" + ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "and z17.d, z22.d, z21.d\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z31.s, z31.s, z16.s\n" + "and z16.d, z19.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n" + ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n" + "sqadd z22.s, z22.s, z17.s\n" + "add z20.s, z20.s, z4.s\n" + "add z31.s, z31.s, z4.s\n" + "sqadd z19.s, z19.s, z16.s\n" + ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n" + "smax z20.s, p2/M, z20.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z6.s\n" + ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n" + "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "add z22.s, z22.s, z4.s\n" + "smin z20.s, p2/M, z20.s, z5.s\n" + "st1b { z20.s }, p0, [x20, x25]\n" + "add z19.s, z19.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z5.s\n" + "st1b { z31.s }, p0, [x22, x25]\n" + "smax z22.s, p2/M, z22.s, z6.s\n" + "smax z19.s, p2/M, z19.s, z6.s\n" + "ld1w { z31.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "addvl %x[params], %x[params], #8\n" + "mov z20.d, z31.d\n" + "smin z22.s, p2/M, z22.s, z5.s\n" + "st1b { z22.s }, p0, [x21, x25]\n" + "mov z22.d, z31.d\n" + "sdot z20.s, z29.b, z0.b\n" + "smin z19.s, p2/M, z19.s, z5.s\n" + "st1b { z19.s }, p0, [x19, x25]\n" + "mov z19.d, z31.d\n" + "incw x25\n" + "sdot z31.s, z29.b, z2.b\n" + "whilelt p0.s, x25, %x[n_channels]\n" + "sdot z20.s, z27.b, z28.b\n" + "ext z2.b, z2.b, z2.b, #0x1\n" + "sdot z31.s, z27.b, z0.b\n" + "sdot z20.s, z25.b, z24.b\n" + "ext z0.b, z0.b, z0.b, #0x1\n" + "ext z24.b, z24.b, z24.b, #0x1\n" + "sdot z22.s, z29.b, z2.b\n" + "sdot z31.s, z25.b, z28.b\n" + "ext z28.b, z28.b, z28.b, #0x1\n" + "sdot z19.s, z29.b, z0.b\n" + "sdot z22.s, z27.b, z0.b\n" + ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n" + ".inst 0x04b77694 // sqrdmulh z20.s, z20.s, z23.s\n" + "sdot z19.s, z27.b, z28.b\n" + "sdot z22.s, z25.b, z28.b\n" + "and z16.d, z31.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sdot z19.s, z25.b, z24.b\n" + ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n" + "and z18.d, z20.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + "and z17.d, z22.d, z21.d\n" + ".inst 0x04b77673 // sqrdmulh z19.s, z19.s, z23.s\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z31.s, z31.s, z16.s\n" + "and z16.d, z19.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z20.s, z20.s, z18.s\n" + ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n" + "sqadd z22.s, z22.s, z17.s\n" + "add z31.s, z31.s, z4.s\n" + ".inst 0x44828ab4 // srshl z20.s, p2/M, z20.s, z21.s\n" + "sqadd z19.s, z19.s, z16.s\n" + ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n" + "smax z31.s, p2/M, z31.s, z6.s\n" + "add z20.s, z20.s, z4.s\n" + ".inst 0x44828ab3 // srshl z19.s, p2/M, z19.s, z21.s\n" + "add z22.s, z22.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z5.s\n" + "st1b { z31.s }, p0, [x22, x25]\n" + "add z19.s, z19.s, z4.s\n" + "smax z22.s, p2/M, z22.s, z6.s\n" + "smax z20.s, p2/M, z20.s, z6.s\n" + "smax z19.s, p2/M, z19.s, z6.s\n" + "smin z22.s, p2/M, z22.s, z5.s\n" + "st1b { z22.s }, p0, [x21, x25]\n" + "smin z20.s, p2/M, z20.s, z5.s\n" + "smin z19.s, p2/M, z19.s, z5.s\n" + "st1b { z20.s }, p0, [x20, x25]\n" + "st1b { z19.s }, p0, [x19, x25]\n" + "incw x25\n" + 
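// Editor's note, not part of the upstream patch: x25 is the channel cursor.
+ // INCW steps it by one vector's worth of 32-bit lanes, and the WHILELT below
+ // rebuilds the governing predicates, so a partial final vector of channels
+ // is handled by predication instead of a scalar tail loop; B.ANY repeats the
+ // loop while any lane is still active.
+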
"whilelt p1.b, x25, %x[n_channels]\n" + "b.any 1b\n" + "addvl SP, SP, #8\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..72b26a50a0 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + +struct sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst +{ + typedef uint32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_dot::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_dot::get_packed_size; + + kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + + sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..ca6af57171 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#if defined(__ARM_FEATURE_SVE) + +#include "arm_gemm.hpp" +#include <cstdint> + +namespace arm_conv { +namespace depthwise { + +void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +{ + __asm__ __volatile__( + "ldp x11, x10, [%x[inptrs], #0x0]\n" + "ptrue p2.b\n" + "ldp x9, x28, [%x[inptrs], #0x10]\n" + "addvl SP, SP, #-8\n" + "ldp x27, x26, [%x[inptrs], #0x20]\n" + "mov x19, #0x1\n" + "ldp x25, x24, [%x[inptrs], #0x30]\n" + "orr x19, x19, #0x100\n" + "ldp x23, x22, [%x[outptrs], #0x0]\n" + "orr x19, x19, #0x10000\n" + "dup z12.s, w19\n" + "ldp x21, x20, [%x[outptrs], #0x10]\n" + "mov x19, #0x0\n" + "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n" + "whilelt p1.b, x19, %x[n_channels]\n" + "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n" + "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n" + "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n" + "1:" // Loop + "mov z7.s, #0x0\n" + "ld1b { z19.b }, p1/Z, [x11, x19]\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "mov z6.s, #0x0\n" + "ld1b { z18.b }, p1/Z, [x10, x19]\n" + "ldp x11, x10, [%x[inptrs], #0x40]\n" + "ld1b { z16.b }, p1/Z, [x9, x19]\n" + "zip1 z21.b, z19.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x28, x19]\n" + "zip2 z19.b, z19.b, z16.b\n" + "ldp x9, x28, [%x[inptrs], #0x50]\n" + "ld1b { z23.b }, p1/Z, [x27, x19]\n" + "zip1 z16.b, z18.b, z17.b\n" + "ld1b { z20.b }, p1/Z, [x26, x19]\n" + "zip2 z18.b, z18.b, z17.b\n" + "ldp x27, x26, [%x[inptrs], #0x60]\n" + "zip1 z5.b, z21.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x25, x19]\n" + "zip2 z4.b, z21.b, z16.b\n" + "ld1b { z16.b }, p1/Z, [x24, x19]\n" + "zip1 z29.b, z19.b, z18.b\n" + "ldp x25, x24, [%x[inptrs], #0x70]\n" + "zip2 z28.b, z19.b, z18.b\n" + "ld1b { z22.b }, p1/Z, [x11, x19]\n" + "zip1 z19.b, z23.b, z17.b\n" + "ld1b { z21.b }, p1/Z, [x10, x19]\n" + "zip2 z27.b, z23.b, z17.b\n" + "ldp x11, x10, [%x[inptrs], #0x0]\n" + "zip1 z18.b, z20.b, z16.b\n" + "ld1b { z17.b }, p1/Z, [x9, x19]\n" + "zip2 z20.b, z20.b, z16.b\n" + "ld1b { z16.b }, p1/Z, [x28, x19]\n" + "zip1 z3.b, z19.b, z18.b\n" + "ldp x9, x28, [%x[inptrs], #0x10]\n" + "zip2 z2.b, z19.b, z18.b\n" + "ld1b { z19.b }, p1/Z, [x27, x19]\n" + "zip1 z26.b, z22.b, z17.b\n" + "ld1b { z25.b }, p1/Z, [x26, x19]\n" + "zip2 z24.b, z22.b, z17.b\n" + "ldp x27, x26, [%x[inptrs], #0x20]\n" + "zip1 z23.b, z21.b, z16.b\n" + "ld1b { z18.b }, p1/Z, [x25, x19]\n" + "zip2 z22.b, z21.b, z16.b\n" + "ld1b { z21.b }, p1/Z, [x24, x19]\n" + "zip1 z17.b, z27.b, z20.b\n" + "ldp x25, x24, [%x[inptrs], #0x30]\n" + "zip2 z16.b, z27.b, z20.b\n" + "st1b { z29.b }, p2, [SP]\n" + "zip1 z20.b, z19.b, z18.b\n" + "st1b { z28.b }, p2, [SP, #1, MUL VL]\n" + "zip2 z19.b, z19.b, z18.b\n" + "st1b { z17.b }, p2, [SP, #2, MUL VL]\n" + "zip1 z18.b, z25.b, z21.b\n" + "st1b { z16.b }, p2, [SP, #3, MUL VL]\n" + "zip2 z17.b, z25.b, z21.b\n" + "ld1w { z1.s }, p2/Z, [%x[params]]\n" + "zip1 z0.b, z26.b, z23.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #1, MUL VL]\n" + "zip2 z30.b, z26.b, z23.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #2, MUL VL]\n" + "zip1 z16.b, z24.b, z22.b\n" + "st1b { z16.b }, p2, [SP, #4, MUL VL]\n" + "zip2 z16.b, z24.b, z22.b\n" + "st1b { z16.b }, p2, [SP, #5, MUL VL]\n" + "zip1 z28.b, z20.b, z18.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "zip2 z26.b, z20.b, z18.b\n" + "ld1w { z25.s }, p2/Z, [%x[params], #4, MUL VL]\n" + "zip1 z16.b,
z19.b, z17.b\n" + "st1b { z16.b }, p2, [SP, #6, MUL VL]\n" + "zip2 z16.b, z19.b, z17.b\n" + "st1b { z16.b }, p2, [SP, #7, MUL VL]\n" + "mov z24.d, z1.d\n" + "ld1w { z23.s }, p2/Z, [%x[params], #5, MUL VL]\n" + "mov z22.d, z1.d\n" + "mov z21.d, z1.d\n" + "udot z1.s, z31.b, z5.b\n" + "udot z22.s, z31.b, z3.b\n" + "udot z7.s, z12.b, z3.b\n" + "udot z1.s, z29.b, z3.b\n" + "ext z3.b, z3.b, z3.b, #0x1\n" + "udot z22.s, z29.b, z0.b\n" + "udot z7.s, z12.b, z0.b\n" + "udot z1.s, z27.b, z0.b\n" + "ext z0.b, z0.b, z0.b, #0x1\n" + "udot z22.s, z27.b, z28.b\n" + "mov z20.d, z7.d\n" + "udot z7.s, z12.b, z5.b\n" + "udot z20.s, z12.b, z28.b\n" + "ext z5.b, z5.b, z5.b, #0x1\n" + "ext z28.b, z28.b, z28.b, #0x1\n" + "udot z21.s, z31.b, z3.b\n" + "udot z6.s, z12.b, z3.b\n" + "udot z24.s, z31.b, z5.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #7, MUL VL]\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "udot z21.s, z29.b, z0.b\n" + "udot z6.s, z12.b, z0.b\n" + "udot z24.s, z29.b, z3.b\n" + "ld1b { z3.b }, p2/Z, [SP, #2, MUL VL]\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "udot z21.s, z27.b, z28.b\n" + "mov z19.d, z6.d\n" + "udot z24.s, z27.b, z0.b\n" + "ld1b { z0.b }, p2/Z, [SP, #4, MUL VL]\n" + "udot z6.s, z12.b, z5.b\n" + "ld1b { z5.b }, p2/Z, [SP]\n" + "udot z19.s, z12.b, z28.b\n" + "ld1b { z28.b }, p2/Z, [SP, #6, MUL VL]\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "mov z7.s, #0x0\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "udot z7.s, z12.b, z2.b\n" + "mov z6.s, #0x0\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "sqadd z1.s, z1.s, z16.s\n" + "udot z7.s, z12.b, z30.b\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "and z17.d, z22.d, z23.d\n" + "mov z20.d, z7.d\n" + "asr z17.s, z17.s, #0x1f\n" + "udot z7.s, z12.b, z4.b\n" + "udot z20.s, z12.b, z26.b\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + "sqadd z24.s, z24.s, z18.s\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "sqadd z22.s, z22.s, z17.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n" + "add z1.s, z1.s, z8.s\n" + "and z16.d, z21.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "add z24.s, z24.s, z8.s\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z22.s, z22.s, z8.s\n" + "sqadd z21.s, z21.s, z16.s\n" + "ld1w { z1.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "ld1b { z29.b }, p2/Z, [%x[params], #-8, MUL VL]\n" + "ld1b { z27.b }, p2/Z, [%x[params], #-7, MUL VL]\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + "ld1w { z25.s }, p2/Z, [%x[params], #-6, MUL VL]\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #-5, MUL VL]\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "mov z24.d, z1.d\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "add z21.s, z21.s, z8.s\n" + "mov z22.d, z1.d\n" + "udot z22.s, z31.b, z2.b\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "udot z22.s, z29.b, z30.b\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "mov z21.d, z1.d\n" + "incw x19\n" + "udot z1.s, z31.b, z4.b\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "udot z22.s, z27.b, z26.b\n" + "ext z4.b, z4.b, z4.b, #0x1\n" + "ext z26.b, z26.b, z26.b, #0x1\n" + 
"udot z1.s, z29.b, z2.b\n" + "ext z2.b, z2.b, z2.b, #0x1\n" + "udot z24.s, z31.b, z4.b\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + "udot z1.s, z27.b, z30.b\n" + "ext z30.b, z30.b, z30.b, #0x1\n" + "udot z21.s, z31.b, z2.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #-3, MUL VL]\n" + "udot z24.s, z29.b, z2.b\n" + "udot z6.s, z12.b, z2.b\n" + "ld1b { z2.b }, p2/Z, [SP, #3, MUL VL]\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + "udot z21.s, z29.b, z30.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #-2, MUL VL]\n" + "udot z24.s, z27.b, z30.b\n" + "udot z6.s, z12.b, z30.b\n" + "ld1b { z30.b }, p2/Z, [SP, #5, MUL VL]\n" + "and z17.d, z22.d, z23.d\n" + "asr z17.s, z17.s, #0x1f\n" + "udot z21.s, z27.b, z26.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #-1, MUL VL]\n" + "mov z19.d, z6.d\n" + "udot z6.s, z12.b, z4.b\n" + "ld1b { z4.b }, p2/Z, [SP, #1, MUL VL]\n" + "udot z19.s, z12.b, z26.b\n" + "ld1b { z26.b }, p2/Z, [SP, #7, MUL VL]\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "mov z7.s, #0x0\n" + "sqadd z22.s, z22.s, z17.s\n" + "udot z7.s, z12.b, z3.b\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "mov z6.s, #0x0\n" + "udot z7.s, z12.b, z0.b\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "mov z20.d, z7.d\n" + "udot z7.s, z12.b, z5.b\n" + "udot z20.s, z12.b, z28.b\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "sqadd z1.s, z1.s, z16.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + "ld1w { z25.s }, p2/Z, [%x[params]]\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + "and z16.d, z21.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z24.s, z24.s, z18.s\n" + "add z22.s, z22.s, z8.s\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n" + "add z1.s, z1.s, z8.s\n" + "sqadd z21.s, z21.s, z16.s\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "add z24.s, z24.s, z8.s\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z21.s, z21.s, z8.s\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "mov z24.d, z1.d\n" + "mov z22.d, z1.d\n" + "udot z22.s, z31.b, z3.b\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "mov z21.d, z1.d\n" + "incw x19\n" + "udot z1.s, z31.b, z5.b\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "udot z22.s, z29.b, z0.b\n" + "ext z5.b, z5.b, z5.b, #0x1\n" + "udot z1.s, z29.b, z3.b\n" + "udot z22.s, z27.b, z28.b\n" + "ext z3.b, z3.b, z3.b, #0x1\n" + "ext z28.b, z28.b, z28.b, #0x1\n" + "udot z24.s, z31.b, z5.b\n" + "udot z1.s, z27.b, z0.b\n" + "ext z0.b, z0.b, z0.b, #0x1\n" + "udot z21.s, z31.b, z3.b\n" + "ld1b { z31.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "udot z24.s, z29.b, z3.b\n" + "udot z6.s, z12.b, z3.b\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "udot z21.s, z29.b, z0.b\n" + "ld1b { z29.b }, p2/Z, [%x[params], #4, MUL VL]\n" + "udot z24.s, z27.b, z0.b\n" + "udot z6.s, z12.b, z0.b\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "udot z21.s, z27.b, z28.b\n" + "ld1b { z27.b }, p2/Z, [%x[params], #5, MUL VL]\n" + "mov z7.s, 
#0x0\n" + "mov z19.d, z6.d\n" + "udot z6.s, z12.b, z5.b\n" + "udot z19.s, z12.b, z28.b\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + "udot z7.s, z12.b, z2.b\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "mov z6.s, #0x0\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "sqadd z1.s, z1.s, z16.s\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + "ld1w { z25.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "and z17.d, z22.d, z23.d\n" + "and z16.d, z21.d, z23.d\n" + "asr z17.s, z17.s, #0x1f\n" + "udot z7.s, z12.b, z30.b\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z24.s, z24.s, z18.s\n" + "add z1.s, z1.s, z8.s\n" + "mov z20.d, z7.d\n" + "sqadd z22.s, z22.s, z17.s\n" + "sqadd z21.s, z21.s, z16.s\n" + "udot z7.s, z12.b, z4.b\n" + "udot z20.s, z12.b, z26.b\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af8 // srshl z24.s, p2/M, z24.s, z23.s\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "ld1w { z23.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z24.s, z24.s, z8.s\n" + "add z22.s, z22.s, z8.s\n" + "ld1w { z1.s }, p2/Z, [%x[params], #2, MUL VL]\n" + "addvl %x[params], %x[params], #8\n" + "add z21.s, z21.s, z8.s\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "mov z24.d, z1.d\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "mov z22.d, z1.d\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "mov z21.d, z1.d\n" + "incw x19\n" + "udot z1.s, z31.b, z4.b\n" + "whilelt p0.s, x19, %x[n_channels]\n" + "udot z22.s, z31.b, z2.b\n" + "ext z4.b, z4.b, z4.b, #0x1\n" + "udot z1.s, z29.b, z2.b\n" + "udot z22.s, z29.b, z30.b\n" + "ext z2.b, z2.b, z2.b, #0x1\n" + "udot z24.s, z31.b, z4.b\n" + "udot z1.s, z27.b, z30.b\n" + "udot z22.s, z27.b, z26.b\n" + "ext z30.b, z30.b, z30.b, #0x1\n" + "ext z26.b, z26.b, z26.b, #0x1\n" + "udot z21.s, z31.b, z2.b\n" + "udot z24.s, z29.b, z2.b\n" + "udot z6.s, z12.b, z2.b\n" + "mls z1.s, p2/M, z7.s, z9.s\n" + "udot z21.s, z29.b, z30.b\n" + "udot z24.s, z27.b, z30.b\n" + "udot z6.s, z12.b, z30.b\n" + ".inst 0x04b97421 // sqrdmulh z1.s, z1.s, z25.s\n" + "udot z21.s, z27.b, z26.b\n" + "mls z22.s, p2/M, z20.s, z9.s\n" + "mov z19.d, z6.d\n" + "udot z6.s, z12.b, z4.b\n" + "udot z19.s, z12.b, z26.b\n" + "and z16.d, z1.d, z23.d\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b976d6 // sqrdmulh z22.s, z22.s, z25.s\n" + "mls z24.s, p2/M, z6.s, z9.s\n" + "mls z21.s, p2/M, z19.s, z9.s\n" + ".inst 0x04b97718 // sqrdmulh z24.s, z24.s, z25.s\n" + "and z17.d, z22.d, z23.d\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z1.s, z1.s, z16.s\n" + ".inst 0x04b976b5 // sqrdmulh z21.s, z21.s, z25.s\n" + "and z18.d, z24.d, z23.d\n" + "asr z18.s, z18.s, #0x1f\n" + "and z16.d, z21.d, z23.d\n" + ".inst 0x44828ae1 // srshl z1.s, p2/M, z1.s, z23.s\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z22.s, z22.s, z17.s\n" + "add z1.s, z1.s, z8.s\n" + "sqadd z24.s, z24.s, z18.s\n" + "smax z1.s, p2/M, z1.s, z11.s\n" + ".inst 0x44828af6 // srshl z22.s, p2/M, z22.s, z23.s\n" + "sqadd z21.s, z21.s, z16.s\n" + ".inst 0x44828af8 // srshl 
z24.s, p2/M, z24.s, z23.s\n" + "add z22.s, z22.s, z8.s\n" + "smin z1.s, p2/M, z1.s, z10.s\n" + "st1b { z1.s }, p0, [x23, x19]\n" + "add z24.s, z24.s, z8.s\n" + "smax z22.s, p2/M, z22.s, z11.s\n" + ".inst 0x44828af5 // srshl z21.s, p2/M, z21.s, z23.s\n" + "smax z24.s, p2/M, z24.s, z11.s\n" + "smin z22.s, p2/M, z22.s, z10.s\n" + "st1b { z22.s }, p0, [x21, x19]\n" + "add z21.s, z21.s, z8.s\n" + "smin z24.s, p2/M, z24.s, z10.s\n" + "st1b { z24.s }, p0, [x22, x19]\n" + "smax z21.s, p2/M, z21.s, z11.s\n" + "smin z21.s, p2/M, z21.s, z10.s\n" + "st1b { z21.s }, p0, [x20, x19]\n" + "incw x19\n" + "whilelt p1.b, x19, %x[n_channels]\n" + "b.any 1b\n" + "addvl SP, SP, #8\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..6174dd0e9f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 4;
+  constexpr static unsigned int input_cols = 4;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+
+  sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..2ec7f6e7ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[16];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x15, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x14, #0x0\n"
+    "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z12.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z18.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z15.s }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_maxval]\n"
+    "ld1rw { z13.s }, p4/Z, [x20]\n"
+    "whilelt p3.h, x15, x17\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
+    "whilelt p2.s, x15, x17\n"
+    "ldp x10, x9, [x21, #0x0]\n"
+    "mov x19, x15\n"
+    "incw x19\n"
+    "ldp x28, x27, [x21, #0x10]\n"
+    "whilelt p1.s, x19, x17\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ld1w { z17.s }, p2/Z, [x19]\n"
+    "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n"
+    "uzp1 z11.s, z17.s, z16.s\n"
+    "addvl x19, x19, #2\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "uzp2 z17.s, z17.s, z16.s\n"
+    "mov z9.d, z11.d\n"
+    "ld1b { z0.h
}, p4/Z, [x16]\n" + ".inst 0x45521800 // usublb z0.h, z0.b, z18.b\n" + "mov z20.d, z17.d\n" + "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n" + "mov z24.d, z11.d\n" + "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n" + ".inst 0x45521821 // usublb z1.h, z1.b, z18.b\n" + "mov z19.d, z17.d\n" + "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n" + "mov z26.d, z11.d\n" + "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n" + ".inst 0x45521842 // usublb z2.h, z2.b, z18.b\n" + "mov z23.d, z17.d\n" + "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n" + ".inst 0x45521863 // usublb z3.h, z3.b, z18.b\n" + "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n" + "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n" + ".inst 0x45521884 // usublb z4.h, z4.b, z18.b\n" + "inch x16, ALL, MUL #8\n" + "ld1b { z8.h }, p4/Z, [x16]\n" + "ldp x23, x22, [x12, #0x0]\n" + ".inst 0x455218a5 // usublb z5.h, z5.b, z18.b\n" + ".inst 0x455218c6 // usublb z6.h, z6.b, z18.b\n" + "ldp x21, x20, [x12, #0x10]\n" + ".inst 0x455218e7 // usublb z7.h, z7.b, z18.b\n" + ".inst 0x45521908 // usublb z8.h, z8.b, z18.b\n" + "ldr x19, [x12, #0x20]\n" + "ld1b { z31.h }, p3/Z, [x23, x15]\n" + ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n" + "ld1b { z30.h }, p3/Z, [x22, x15]\n" + "ld1b { z29.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n" + "ld1b { z28.h }, p3/Z, [x20, x15]\n" + "ld1b { z27.h }, p3/Z, [x19, x15]\n" + ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n" + ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n" + ".inst 0x454c1b7b // usublb z27.h, z27.b, z12.b\n" + "1:" // Loop + ".inst 0x448443eb // smlalb z11.s, p4/M, z31.h, z4.h\n" + "ldr x21, [x12, #0x28]\n" + "whilelt p0.h, x14, x17\n" + ".inst 0x448447f1 // smlalt z17.s, p4/M, z31.h, z4.h\n" + "ldr x20, [x12, #0x30]\n" + "inch x16\n" + ".inst 0x448343e9 // smlalb z9.s, p4/M, z31.h, z3.h\n" + "ldr x26, [x12, #0x38]\n" + ".inst 0x448347f4 // smlalt z20.s, p4/M, z31.h, z3.h\n" + "ldr x25, [x12, #0x40]\n" + ".inst 0x448143f8 // smlalb z24.s, p4/M, z31.h, z1.h\n" + "ldr x19, [x12, #0x48]\n" + ".inst 0x448147f3 // smlalt z19.s, p4/M, z31.h, z1.h\n" + "ldr x24, [x12, #0x50]\n" + ".inst 0x448043fa // smlalb z26.s, p4/M, z31.h, z0.h\n" + "ldr x23, [x12, #0x58]\n" + ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n" + "ld1b { z31.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n" + ".inst 0x448043cb // smlalb z11.s, p4/M, z30.h, z0.h\n" + "ldr x22, [x12, #0x60]\n" + ".inst 0x448047d1 // smlalt z17.s, p4/M, z30.h, z0.h\n" + "ld1b { z30.h }, p3/Z, [x19, x15]\n" + ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n" + ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n" + "ldr x21, [x12, #0x68]\n" + ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n" + "ld1b { z29.h }, p3/Z, [x20, x15]\n" + ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n" + ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n" + "ldr x20, [x12, #0x70]\n" + ".inst 0x44854791 // smlalt z17.s, p4/M, z28.h, z5.h\n" + "ldr x19, [x12, #0x78]\n" + ".inst 0x44844389 // smlalb z9.s, p4/M, z28.h, z4.h\n" + "ld1w { z25.s }, p2/Z, [x13]\n" + ".inst 0x44844794 // smlalt z20.s, p4/M, z28.h, z4.h\n" + "ld1w { z16.s }, p1/Z, [x13, #1, MUL VL]\n" + "addvl x13, x13, #2\n" + ".inst 0x44824398 // smlalb z24.s, p4/M, z28.h, z2.h\n" + ".inst 0x44824793 // smlalt z19.s, p4/M, z28.h, z2.h\n" + ".inst 0x4481439a // smlalb z26.s, p4/M, z28.h, z1.h\n" + "uzp1 z10.s, z25.s, z16.s\n" + "uzp2 z22.s, z25.s, z16.s\n" + "ld1w { z25.s }, p2/Z, [x11]\n" + ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n" + "ld1b { z28.h }, p3/Z, [x26, 
x15]\n" + ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n" + ".inst 0x448643f8 // smlalb z24.s, p4/M, z31.h, z6.h\n" + "ld1w { z16.s }, p1/Z, [x11, #1, MUL VL]\n" + ".inst 0x448647f3 // smlalt z19.s, p4/M, z31.h, z6.h\n" + "ld1b { z31.h }, p3/Z, [x25, x15]\n" + "addvl x11, x11, #2\n" + ".inst 0x4487436b // smlalb z11.s, p4/M, z27.h, z7.h\n" + ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n" + "uzp1 z21.s, z25.s, z16.s\n" + "uzp2 z25.s, z25.s, z16.s\n" + ".inst 0x44874771 // smlalt z17.s, p4/M, z27.h, z7.h\n" + ".inst 0x44864369 // smlalb z9.s, p4/M, z27.h, z6.h\n" + ".inst 0x44864774 // smlalt z20.s, p4/M, z27.h, z6.h\n" + ".inst 0x44844378 // smlalb z24.s, p4/M, z27.h, z4.h\n" + ".inst 0x44844773 // smlalt z19.s, p4/M, z27.h, z4.h\n" + ".inst 0x4483437a // smlalb z26.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834777 // smlalt z23.s, p4/M, z27.h, z3.h\n" + ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814791 // smlalt z17.s, p4/M, z28.h, z1.h\n" + ".inst 0x448843ba // smlalb z26.s, p4/M, z29.h, z8.h\n" + ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n" + "ld1b { z29.h }, p3/Z, [x24, x15]\n" + ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n" + ".inst 0x44804389 // smlalb z9.s, p4/M, z28.h, z0.h\n" + ".inst 0x44804794 // smlalt z20.s, p4/M, z28.h, z0.h\n" + "ld1b { z28.h }, p3/Z, [x23, x15]\n" + ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n" + ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n" + ".inst 0x448247f1 // smlalt z17.s, p4/M, z31.h, z2.h\n" + ".inst 0x448143e9 // smlalb z9.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147f4 // smlalt z20.s, p4/M, z31.h, z1.h\n" + "ld1b { z31.h }, p3/Z, [x22, x15]\n" + ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n" + ".inst 0x448843cb // smlalb z11.s, p4/M, z30.h, z8.h\n" + ".inst 0x448847d1 // smlalt z17.s, p4/M, z30.h, z8.h\n" + ".inst 0x448743c9 // smlalb z9.s, p4/M, z30.h, z7.h\n" + ".inst 0x448747d4 // smlalt z20.s, p4/M, z30.h, z7.h\n" + ".inst 0x448543d8 // smlalb z24.s, p4/M, z30.h, z5.h\n" + ".inst 0x448547d3 // smlalt z19.s, p4/M, z30.h, z5.h\n" + ".inst 0x448443da // smlalb z26.s, p4/M, z30.h, z4.h\n" + ".inst 0x448447d7 // smlalt z23.s, p4/M, z30.h, z4.h\n" + "ld1b { z30.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n" + ".inst 0x448343ab // smlalb z11.s, p4/M, z29.h, z3.h\n" + ".inst 0x448347b1 // smlalt z17.s, p4/M, z29.h, z3.h\n" + ".inst 0x448043b8 // smlalb z24.s, p4/M, z29.h, z0.h\n" + ".inst 0x448047b3 // smlalt z19.s, p4/M, z29.h, z0.h\n" + "ld1b { z29.h }, p3/Z, [x20, x15]\n" + ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n" + ".inst 0x44854389 // smlalb z9.s, p4/M, z28.h, z5.h\n" + ".inst 0x44854794 // smlalt z20.s, p4/M, z28.h, z5.h\n" + ".inst 0x4482439a // smlalb z26.s, p4/M, z28.h, z2.h\n" + ".inst 0x44824797 // smlalt z23.s, p4/M, z28.h, z2.h\n" + "ld1b { z28.h }, p3/Z, [x19, x15]\n" + "inch x15\n" + ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n" + "whilelt p2.s, x15, x17\n" + ".inst 0x448647f1 // smlalt z17.s, p4/M, z31.h, z6.h\n" + "mov x19, x15\n" + ".inst 0x448343f8 // smlalb z24.s, p4/M, z31.h, z3.h\n" + ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n" + ".inst 0x448347f3 // smlalt z19.s, p4/M, z31.h, z3.h\n" + "incw x19\n" + ".inst 0x448843c9 // smlalb z9.s, p4/M, z30.h, z8.h\n" + "whilelt p1.s, x19, x17\n" + ".inst 0x04aa756b // sqrdmulh z11.s, z11.s, z10.s\n" + "whilelt p3.h, x15, x17\n" + ".inst 0x04b67631 // sqrdmulh z17.s, z17.s, z22.s\n" + ".inst 0x448847d4 // smlalt z20.s, p4/M, z30.h, z8.h\n" + ".inst 0x04aa7529 // sqrdmulh z9.s, 
z9.s, z10.s\n" + "and z16.d, z11.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "and z1.d, z17.d, z25.d\n" + "and z27.d, z9.d, z21.d\n" + "asr z1.s, z1.s, #0x1f\n" + ".inst 0x04b67694 // sqrdmulh z20.s, z20.s, z22.s\n" + ".inst 0x448543da // smlalb z26.s, p4/M, z30.h, z5.h\n" + "asr z27.s, z27.s, #0x1f\n" + ".inst 0x448547d7 // smlalt z23.s, p4/M, z30.h, z5.h\n" + "sqadd z11.s, z11.s, z16.s\n" + ".inst 0x448743b8 // smlalb z24.s, p4/M, z29.h, z7.h\n" + "and z16.d, z20.d, z25.d\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z17.s, z17.s, z1.s\n" + "sqadd z9.s, z9.s, z27.s\n" + ".inst 0x448747b3 // smlalt z19.s, p4/M, z29.h, z7.h\n" + ".inst 0x448643ba // smlalb z26.s, p4/M, z29.h, z6.h\n" + ".inst 0x448647b7 // smlalt z23.s, p4/M, z29.h, z6.h\n" + ".inst 0x44884398 // smlalb z24.s, p4/M, z28.h, z8.h\n" + "sqadd z20.s, z20.s, z16.s\n" + ".inst 0x44884793 // smlalt z19.s, p4/M, z28.h, z8.h\n" + ".inst 0x4487439a // smlalb z26.s, p4/M, z28.h, z7.h\n" + ".inst 0x04aa7718 // sqrdmulh z24.s, z24.s, z10.s\n" + ".inst 0x44874797 // smlalt z23.s, p4/M, z28.h, z7.h\n" + ".inst 0x04b67673 // sqrdmulh z19.s, z19.s, z22.s\n" + ".inst 0x04aa775a // sqrdmulh z26.s, z26.s, z10.s\n" + "and z16.d, z24.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "and z7.d, z19.d, z25.d\n" + "and z3.d, z26.d, z21.d\n" + "asr z7.s, z7.s, #0x1f\n" + ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n" + ".inst 0x448292ab // srshl z11.s, p4/M, z11.s, z21.s\n" + "asr z3.s, z3.s, #0x1f\n" + ".inst 0x44829331 // srshl z17.s, p4/M, z17.s, z25.s\n" + "sqadd z24.s, z24.s, z16.s\n" + ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n" + "add z11.s, z11.s, z15.s\n" + "add z17.s, z17.s, z15.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "add z9.s, z9.s, z15.s\n" + "sqadd z26.s, z26.s, z3.s\n" + "and z16.d, z23.d, z25.d\n" + "asr z16.s, z16.s, #0x1f\n" + "smin z11.s, p4/M, z11.s, z14.s\n" + "smin z17.s, p4/M, z17.s, z14.s\n" + "smin z9.s, p4/M, z9.s, z14.s\n" + ".inst 0x44829334 // srshl z20.s, p4/M, z20.s, z25.s\n" + ".inst 0x448292b8 // srshl z24.s, p4/M, z24.s, z21.s\n" + "smax z11.s, p4/M, z11.s, z13.s\n" + "sqadd z23.s, z23.s, z16.s\n" + "add z20.s, z20.s, z15.s\n" + "add z24.s, z24.s, z15.s\n" + "smax z17.s, p4/M, z17.s, z13.s\n" + "smax z9.s, p4/M, z9.s, z13.s\n" + "smin z20.s, p4/M, z20.s, z14.s\n" + "smin z24.s, p4/M, z24.s, z14.s\n" + "trn1 z11.h, z11.h, z17.h\n" + "st1b { z11.h }, p0, [x10, x14]\n" + "smax z20.s, p4/M, z20.s, z13.s\n" + ".inst 0x44829333 // srshl z19.s, p4/M, z19.s, z25.s\n" + "smax z24.s, p4/M, z24.s, z13.s\n" + ".inst 0x448292ba // srshl z26.s, p4/M, z26.s, z21.s\n" + ".inst 0x44829337 // srshl z23.s, p4/M, z23.s, z25.s\n" + "trn1 z9.h, z9.h, z20.h\n" + "st1b { z9.h }, p0, [x9, x14]\n" + "add z19.s, z19.s, z15.s\n" + "add z26.s, z26.s, z15.s\n" + "add z23.s, z23.s, z15.s\n" + "smin z19.s, p4/M, z19.s, z14.s\n" + "smin z26.s, p4/M, z26.s, z14.s\n" + "smin z23.s, p4/M, z23.s, z14.s\n" + "smax z19.s, p4/M, z19.s, z13.s\n" + "smax z26.s, p4/M, z26.s, z13.s\n" + "smax z23.s, p4/M, z23.s, z13.s\n" + "trn1 z24.h, z24.h, z19.h\n" + "st1b { z24.h }, p0, [x28, x14]\n" + "trn1 z26.h, z26.h, z23.h\n" + "st1b { z26.h }, p0, [x27, x14]\n" + "inch x14\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z17.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z11.s, z17.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z17.s, z17.s, z16.s\n" + "mov z9.d, z11.d\n" + "ld1b { z0.h }, p4/Z, [x16]\n" + ".inst 0x45521800 // usublb z0.h, z0.b, z18.b\n" + "mov z20.d, 
z17.d\n" + "ld1b { z1.h }, p4/Z, [x16, #1, MUL VL]\n" + "mov z24.d, z11.d\n" + "ld1b { z2.h }, p4/Z, [x16, #2, MUL VL]\n" + ".inst 0x45521821 // usublb z1.h, z1.b, z18.b\n" + "mov z19.d, z17.d\n" + "ld1b { z3.h }, p4/Z, [x16, #3, MUL VL]\n" + "mov z26.d, z11.d\n" + "ld1b { z4.h }, p4/Z, [x16, #4, MUL VL]\n" + ".inst 0x45521842 // usublb z2.h, z2.b, z18.b\n" + "mov z23.d, z17.d\n" + "ld1b { z5.h }, p4/Z, [x16, #5, MUL VL]\n" + ".inst 0x45521863 // usublb z3.h, z3.b, z18.b\n" + "ld1b { z6.h }, p4/Z, [x16, #6, MUL VL]\n" + "ld1b { z7.h }, p4/Z, [x16, #7, MUL VL]\n" + ".inst 0x45521884 // usublb z4.h, z4.b, z18.b\n" + "inch x16, ALL, MUL #8\n" + "ld1b { z8.h }, p4/Z, [x16]\n" + "ldp x23, x22, [x12, #0x0]\n" + ".inst 0x455218a5 // usublb z5.h, z5.b, z18.b\n" + ".inst 0x455218c6 // usublb z6.h, z6.b, z18.b\n" + "ldp x21, x20, [x12, #0x10]\n" + ".inst 0x455218e7 // usublb z7.h, z7.b, z18.b\n" + ".inst 0x45521908 // usublb z8.h, z8.b, z18.b\n" + "ldr x19, [x12, #0x20]\n" + "ld1b { z31.h }, p3/Z, [x23, x15]\n" + ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n" + "ld1b { z30.h }, p3/Z, [x22, x15]\n" + "ld1b { z29.h }, p3/Z, [x21, x15]\n" + ".inst 0x454c1bde // usublb z30.h, z30.b, z12.b\n" + "ld1b { z28.h }, p3/Z, [x20, x15]\n" + "ld1b { z27.h }, p3/Z, [x19, x15]\n" + ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n" + ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n" + ".inst 0x454c1b7b // usublb z27.h, z27.b, z12.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..1f470f78aa --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 5;
+  constexpr static unsigned int input_cols = 5;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+
+  kern_type kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+
+  sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..bc8f0ac1d9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[25];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[12];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x5, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x7, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x8, #0x0\n"
+    "ldr x17,
[%x[params], %[offsetof_Params_requant_muls]]\n" + "add x16, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1rb { z19.b }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1rb { z12.b }, p4/Z, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1rw { z14.s }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1rw { z20.s }, p4/Z, [x20]\n" + "whilelt p3.h, x7, x5\n" + "ld1rw { z15.s }, p4/Z, [x19]\n" + "whilelt p2.s, x7, x5\n" + "ldp x14, x13, [x21, #0x0]\n" + "mov x19, x7\n" + "incw x19\n" + "ldp x12, x11, [x21, #0x10]\n" + "whilelt p1.s, x19, x5\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z18.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z18.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z18.s, z16.s\n" + "mov z11.d, z13.d\n" + "ld1b { z0.h }, p4/Z, [x6]\n" + ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n" + "mov z9.d, z16.d\n" + "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n" + "mov z18.d, z13.d\n" + "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n" + ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n" + "mov z10.d, z16.d\n" + "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n" + "mov z22.d, z13.d\n" + "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n" + ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n" + "mov z23.d, z16.d\n" + "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n" + ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n" + "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n" + "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n" + ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n" + "inch x6, ALL, MUL #8\n" + "ld1b { z8.h }, p4/Z, [x6]\n" + "ldp x26, x25, [x16, #0x0]\n" + ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n" + ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n" + "ldp x24, x23, [x16, #0x10]\n" + ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n" + ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ld1b { z31.h }, p3/Z, [x26, x7]\n" + ".inst 0x45531bff // usublb z31.h, z31.b, z19.b\n" + "ld1b { z30.h }, p3/Z, [x25, x7]\n" + "ld1b { z29.h }, p3/Z, [x24, x7]\n" + ".inst 0x45531bde // usublb z30.h, z30.b, z19.b\n" + "ld1b { z28.h }, p3/Z, [x23, x7]\n" + "ld1b { z27.h }, p3/Z, [x22, x7]\n" + ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n" + "ld1b { z26.h }, p3/Z, [x21, x7]\n" + ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n" + "ld1b { z25.h }, p3/Z, [x20, x7]\n" + "ld1b { z24.h }, p3/Z, [x19, x7]\n" + ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n" + ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n" + ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n" + ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n" + "1:" // Loop + ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n" + "ldr x23, [x16, #0x40]\n" + "whilelt p0.h, x8, x5\n" + ".inst 0x448847f0 // smlalt z16.s, p4/M, z31.h, z8.h\n" + "ldr x22, [x16, #0x48]\n" + "inch x6\n" + ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n" + "ldr x21, [x16, #0x50]\n" + ".inst 0x448647e9 // smlalt z9.s, p4/M, z31.h, z6.h\n" + "ldr x20, [x16, #0x58]\n" + ".inst 0x448243f2 // smlalb z18.s, p4/M, z31.h, z2.h\n" + "ldr x19, [x16, #0x60]\n" + ".inst 0x448247ea // smlalt z10.s, p4/M, z31.h, z2.h\n" + "ldr x10, [x16, #0x68]\n" + ".inst 
0x448043f6 // smlalb z22.s, p4/M, z31.h, z0.h\n" + "ldr x9, [x16, #0x70]\n" + ".inst 0x448047f7 // smlalt z23.s, p4/M, z31.h, z0.h\n" + "ldr x28, [x16, #0x78]\n" + ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n" + "ldr x27, [x16, #0x80]\n" + ".inst 0x448047d0 // smlalt z16.s, p4/M, z30.h, z0.h\n" + "ldr x26, [x16, #0x88]\n" + ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n" + "ldr x25, [x16, #0x90]\n" + ".inst 0x44814789 // smlalt z9.s, p4/M, z28.h, z1.h\n" + "ld1b { z28.h }, p3/Z, [x22, x7]\n" + ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n" + ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n" + "ldr x24, [x16, #0x98]\n" + ".inst 0x448147b0 // smlalt z16.s, p4/M, z29.h, z1.h\n" + "ld1b { z29.h }, p3/Z, [x23, x7]\n" + ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n" + ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n" + "ldr x23, [x16, #0xa0]\n" + ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n" + "ld1b { z27.h }, p3/Z, [x21, x7]\n" + ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n" + ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n" + "ldr x22, [x16, #0xa8]\n" + ".inst 0x44834750 // smlalt z16.s, p4/M, z26.h, z3.h\n" + "ld1b { z26.h }, p3/Z, [x20, x7]\n" + ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n" + ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n" + "ldr x21, [x16, #0xb0]\n" + ".inst 0x44844730 // smlalt z16.s, p4/M, z25.h, z4.h\n" + "ld1b { z25.h }, p3/Z, [x19, x7]\n" + ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n" + ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n" + "ldr x20, [x16, #0xb8]\n" + ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n" + "ldr x19, [x16, #0xc0]\n" + ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n" + "ld1w { z21.s }, p2/Z, [x17]\n" + ".inst 0x44804709 // smlalt z9.s, p4/M, z24.h, z0.h\n" + "ld1b { z24.h }, p3/Z, [x9, x7]\n" + ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n" + ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n" + "ld1w { z17.s }, p1/Z, [x17, #1, MUL VL]\n" + ".inst 0x448447a9 // smlalt z9.s, p4/M, z29.h, z4.h\n" + "ld1b { z29.h }, p3/Z, [x10, x7]\n" + "addvl x17, x17, #2\n" + ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n" + ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n" + "uzp1 z30.s, z21.s, z17.s\n" + "uzp2 z31.s, z21.s, z17.s\n" + "ld1w { z21.s }, p2/Z, [x15]\n" + ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n" + "ld1w { z17.s }, p1/Z, [x15, #1, MUL VL]\n" + "addvl x15, x15, #2\n" + ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n" + "ld1b { z28.h }, p3/Z, [x27, x7]\n" + ".inst 0x45531b9c // usublb z28.h, z28.b, z19.b\n" + ".inst 0x44854770 // smlalt z16.s, p4/M, z27.h, z5.h\n" + ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834769 // smlalt z9.s, p4/M, z27.h, z3.h\n" + "ld1b { z27.h }, p3/Z, [x28, x7]\n" + ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n" + ".inst 0x44834352 // smlalb z18.s, p4/M, z26.h, z3.h\n" + ".inst 0x4483474a // smlalt z10.s, p4/M, z26.h, z3.h\n" + "ld1b { z26.h }, p3/Z, [x26, x7]\n" + ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n" + ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n" + ".inst 0x44864730 // smlalt z16.s, p4/M, z25.h, z6.h\n" + ".inst 0x44804332 // smlalb z18.s, p4/M, z25.h, z0.h\n" + ".inst 0x4480472a // smlalt z10.s, p4/M, z25.h, z0.h\n" + "ld1b { z25.h }, p3/Z, [x25, x7]\n" + ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n" + "uzp1 z0.s, z21.s, z17.s\n" + "uzp2 z21.s, z21.s, z17.s\n" + ".inst 0x448443b2 // smlalb z18.s, p4/M, z29.h, z4.h\n" + ".inst 
0x448447aa // smlalt z10.s, p4/M, z29.h, z4.h\n" + "ld1b { z29.h }, p3/Z, [x24, x7]\n" + ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n" + ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n" + ".inst 0x44874710 // smlalt z16.s, p4/M, z24.h, z7.h\n" + ".inst 0x44814312 // smlalb z18.s, p4/M, z24.h, z1.h\n" + ".inst 0x4481470a // smlalt z10.s, p4/M, z24.h, z1.h\n" + "ld1b { z24.h }, p3/Z, [x22, x7]\n" + ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n" + ".inst 0x04be75ad // sqrdmulh z13.s, z13.s, z30.s\n" + ".inst 0x04bf7610 // sqrdmulh z16.s, z16.s, z31.s\n" + ".inst 0x44844376 // smlalb z22.s, p4/M, z27.h, z4.h\n" + ".inst 0x44844777 // smlalt z23.s, p4/M, z27.h, z4.h\n" + "ld1b { z27.h }, p3/Z, [x23, x7]\n" + ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n" + "and z4.d, z13.d, z0.d\n" + "and z17.d, z16.d, z21.d\n" + "asr z4.s, z4.s, #0x1f\n" + ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n" + ".inst 0x44874789 // smlalt z9.s, p4/M, z28.h, z7.h\n" + "asr z17.s, z17.s, #0x1f\n" + ".inst 0x44814396 // smlalb z22.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814797 // smlalt z23.s, p4/M, z28.h, z1.h\n" + ".inst 0x44864332 // smlalb z18.s, p4/M, z25.h, z6.h\n" + ".inst 0x4486472a // smlalt z10.s, p4/M, z25.h, z6.h\n" + "ld1b { z25.h }, p3/Z, [x20, x7]\n" + ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n" + "sqadd z13.s, z13.s, z4.s\n" + "sqadd z16.s, z16.s, z17.s\n" + ".inst 0x44854356 // smlalb z22.s, p4/M, z26.h, z5.h\n" + ".inst 0x44854757 // smlalt z23.s, p4/M, z26.h, z5.h\n" + "ld1b { z26.h }, p3/Z, [x21, x7]\n" + ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n" + ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n" + ".inst 0x448847a9 // smlalt z9.s, p4/M, z29.h, z8.h\n" + ".inst 0x448243b6 // smlalb z22.s, p4/M, z29.h, z2.h\n" + ".inst 0x448247b7 // smlalt z23.s, p4/M, z29.h, z2.h\n" + "ld1b { z29.h }, p3/Z, [x19, x7]\n" + "inch x7\n" + ".inst 0x04be756b // sqrdmulh z11.s, z11.s, z30.s\n" + "whilelt p2.s, x7, x5\n" + ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n" + "mov x19, x7\n" + ".inst 0x44874372 // smlalb z18.s, p4/M, z27.h, z7.h\n" + ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n" + ".inst 0x4487476a // smlalt z10.s, p4/M, z27.h, z7.h\n" + "incw x19\n" + ".inst 0x44834316 // smlalb z22.s, p4/M, z24.h, z3.h\n" + "whilelt p1.s, x19, x5\n" + "and z1.d, z11.d, z0.d\n" + "whilelt p3.h, x7, x5\n" + "and z17.d, z9.d, z21.d\n" + "asr z1.s, z1.s, #0x1f\n" + ".inst 0x44854312 // smlalb z18.s, p4/M, z24.h, z5.h\n" + ".inst 0x4485470a // smlalt z10.s, p4/M, z24.h, z5.h\n" + "asr z17.s, z17.s, #0x1f\n" + ".inst 0x44834717 // smlalt z23.s, p4/M, z24.h, z3.h\n" + ".inst 0x44874356 // smlalb z22.s, p4/M, z26.h, z7.h\n" + ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n" + ".inst 0x44884332 // smlalb z18.s, p4/M, z25.h, z8.h\n" + "sqadd z11.s, z11.s, z1.s\n" + "sqadd z9.s, z9.s, z17.s\n" + "add z13.s, z13.s, z14.s\n" + ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n" + ".inst 0x44874757 // smlalt z23.s, p4/M, z26.h, z7.h\n" + ".inst 0x4488472a // smlalt z10.s, p4/M, z25.h, z8.h\n" + ".inst 0x44864336 // smlalb z22.s, p4/M, z25.h, z6.h\n" + "and z17.d, z18.d, z0.d\n" + "asr z17.s, z17.s, #0x1f\n" + ".inst 0x04bf754a // sqrdmulh z10.s, z10.s, z31.s\n" + ".inst 0x44864737 // smlalt z23.s, p4/M, z25.h, z6.h\n" + ".inst 0x448843b6 // smlalb z22.s, p4/M, z29.h, z8.h\n" + "smin z13.s, p4/M, z13.s, z15.s\n" + ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n" + "and z1.d, z10.d, z21.d\n" + "asr z1.s, z1.s, #0x1f\n" + "add z16.s, z16.s, z14.s\n" + "sqadd z18.s, z18.s, 
z17.s\n" + ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n" + ".inst 0x448847b7 // smlalt z23.s, p4/M, z29.h, z8.h\n" + "smax z13.s, p4/M, z13.s, z20.s\n" + "smin z16.s, p4/M, z16.s, z15.s\n" + "sqadd z10.s, z10.s, z1.s\n" + "and z2.d, z22.d, z0.d\n" + "asr z2.s, z2.s, #0x1f\n" + ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n" + "smax z16.s, p4/M, z16.s, z20.s\n" + ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n" + ".inst 0x448292a9 // srshl z9.s, p4/M, z9.s, z21.s\n" + ".inst 0x44829012 // srshl z18.s, p4/M, z18.s, z0.s\n" + "trn1 z13.h, z13.h, z16.h\n" + "st1b { z13.h }, p0, [x14, x8]\n" + "add z11.s, z11.s, z14.s\n" + "add z9.s, z9.s, z14.s\n" + "add z18.s, z18.s, z14.s\n" + "sqadd z22.s, z22.s, z2.s\n" + "and z16.d, z23.d, z21.d\n" + "asr z16.s, z16.s, #0x1f\n" + "smin z11.s, p4/M, z11.s, z15.s\n" + "smin z9.s, p4/M, z9.s, z15.s\n" + "smin z18.s, p4/M, z18.s, z15.s\n" + ".inst 0x448292aa // srshl z10.s, p4/M, z10.s, z21.s\n" + ".inst 0x44829016 // srshl z22.s, p4/M, z22.s, z0.s\n" + "smax z11.s, p4/M, z11.s, z20.s\n" + "sqadd z23.s, z23.s, z16.s\n" + "add z10.s, z10.s, z14.s\n" + "add z22.s, z22.s, z14.s\n" + "smax z9.s, p4/M, z9.s, z20.s\n" + "smax z18.s, p4/M, z18.s, z20.s\n" + "smin z10.s, p4/M, z10.s, z15.s\n" + "smin z22.s, p4/M, z22.s, z15.s\n" + "trn1 z11.h, z11.h, z9.h\n" + "st1b { z11.h }, p0, [x13, x8]\n" + "smax z10.s, p4/M, z10.s, z20.s\n" + ".inst 0x448292b7 // srshl z23.s, p4/M, z23.s, z21.s\n" + "smax z22.s, p4/M, z22.s, z20.s\n" + "trn1 z18.h, z18.h, z10.h\n" + "st1b { z18.h }, p0, [x12, x8]\n" + "add z23.s, z23.s, z14.s\n" + "smin z23.s, p4/M, z23.s, z15.s\n" + "smax z23.s, p4/M, z23.s, z20.s\n" + "trn1 z22.h, z22.h, z23.h\n" + "st1b { z22.h }, p0, [x11, x8]\n" + "inch x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z18.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z18.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z18.s, z16.s\n" + "mov z11.d, z13.d\n" + "ld1b { z0.h }, p4/Z, [x6]\n" + ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n" + "mov z9.d, z16.d\n" + "ld1b { z1.h }, p4/Z, [x6, #1, MUL VL]\n" + "mov z18.d, z13.d\n" + "ld1b { z2.h }, p4/Z, [x6, #2, MUL VL]\n" + ".inst 0x454c1821 // usublb z1.h, z1.b, z12.b\n" + "mov z10.d, z16.d\n" + "ld1b { z3.h }, p4/Z, [x6, #3, MUL VL]\n" + "mov z22.d, z13.d\n" + "ld1b { z4.h }, p4/Z, [x6, #4, MUL VL]\n" + ".inst 0x454c1842 // usublb z2.h, z2.b, z12.b\n" + "mov z23.d, z16.d\n" + "ld1b { z5.h }, p4/Z, [x6, #5, MUL VL]\n" + ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n" + "ld1b { z6.h }, p4/Z, [x6, #6, MUL VL]\n" + "ld1b { z7.h }, p4/Z, [x6, #7, MUL VL]\n" + ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n" + "inch x6, ALL, MUL #8\n" + "ld1b { z8.h }, p4/Z, [x6]\n" + "ldp x26, x25, [x16, #0x0]\n" + ".inst 0x454c18a5 // usublb z5.h, z5.b, z12.b\n" + ".inst 0x454c18c6 // usublb z6.h, z6.b, z12.b\n" + "ldp x24, x23, [x16, #0x10]\n" + ".inst 0x454c18e7 // usublb z7.h, z7.b, z12.b\n" + ".inst 0x454c1908 // usublb z8.h, z8.b, z12.b\n" + "ldp x22, x21, [x16, #0x20]\n" + "ldp x20, x19, [x16, #0x30]\n" + "ld1b { z31.h }, p3/Z, [x26, x7]\n" + ".inst 0x45531bff // usublb z31.h, z31.b, z19.b\n" + "ld1b { z30.h }, p3/Z, [x25, x7]\n" + "ld1b { z29.h }, p3/Z, [x24, x7]\n" + ".inst 0x45531bde // usublb z30.h, z30.b, z19.b\n" + "ld1b { z28.h }, p3/Z, [x23, x7]\n" + "ld1b { z27.h }, p3/Z, [x22, x7]\n" + ".inst 0x45531bbd // usublb z29.h, z29.b, z19.b\n" + "ld1b { z26.h }, p3/Z, [x21, x7]\n" + ".inst 0x45531b9c // 
usublb z28.h, z28.b, z19.b\n" + "ld1b { z25.h }, p3/Z, [x20, x7]\n" + "ld1b { z24.h }, p3/Z, [x19, x7]\n" + ".inst 0x45531b7b // usublb z27.h, z27.b, z19.b\n" + ".inst 0x45531b5a // usublb z26.h, z26.b, z19.b\n" + ".inst 0x45531b39 // usublb z25.h, z25.b, z19.b\n" + ".inst 0x45531b18 // usublb z24.h, z24.b, z19.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..f025b08a29 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+struct sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+  typedef int32_t bias_type;
+  typedef uint8_t input_type;
+  typedef uint8_t weight_type;
+  typedef uint8_t return_type;
+
+  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+
+  typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+  typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
+  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  constexpr static unsigned int input_rows = 6;
+  constexpr static unsigned int input_cols = 6;
+
+  constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_5x5_mla::pack_parameters;
+  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_5x5_mla::get_packed_size;
+
+  kern_type kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+
+  sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..95423186b8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const uint8_t *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const uint8_t *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "ptrue p4.b\n"
+    "ldr x1, [%x[params], %[offsetof_Params_weights]]\n"
+    "mov x2, #0x0\n"
+    "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
+    "mov x3, #0x0\n"
+    "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
+    "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
+    "ld1rb { z17.b }, p4/Z, [x19]\n"
+    "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
+    "ld1rb { z13.b }, p4/Z, [x20]\n"
+    "add x20, x22, %[offsetof_Requantize32_minval]\n"
+    "ld1rw { z14.s }, p4/Z, [x19]\n"
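+    // Note (editorial comment, not emitted by the kernel generator): the
+    // surrounding ld1rb/ld1rw instructions broadcast the arm_gemm::Requantize32
+    // fields into vectors. z17 holds a_offset and z13 holds b_offset, which the
+    // usublb instructions below subtract while widening activations and weights
+    // to 16 bits; z14 (c_offset) and the minval/maxval words loaded just below
+    // are applied after the sqrdmulh/srshl requantisation of each accumulator.
+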
"add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1rw { z5.s }, p4/Z, [x20]\n" + "whilelt p3.h, x2, x0\n" + "ld1rw { z15.s }, p4/Z, [x19]\n" + "whilelt p2.s, x2, x0\n" + "ldp x7, x8, [x21, #0x0]\n" + "mov x19, x2\n" + "incw x19\n" + "ldp x17, x16, [x21, #0x10]\n" + "whilelt p1.s, x19, x0\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z19.s }, p2/Z, [x19]\n" + "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z11.s, z19.s, z6.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z19.s, z6.s\n" + "mov z19.d, z11.d\n" + "ld1b { z0.h }, p4/Z, [x1]\n" + ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n" + "mov z9.d, z16.d\n" + "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n" + "mov z7.d, z11.d\n" + "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n" + "mov z6.d, z16.d\n" + "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n" + "mov z12.d, z11.d\n" + "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n" + "mov z8.d, z16.d\n" + "ldp x28, x27, [x5, #0x0]\n" + ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n" + "ldp x26, x25, [x5, #0x10]\n" + ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n" + "ldp x24, x23, [x5, #0x20]\n" + "ldp x22, x21, [x5, #0x30]\n" + "ldp x20, x19, [x5, #0x40]\n" + "ld1b { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n" + "ld1b { z30.h }, p3/Z, [x27, x2]\n" + "ld1b { z29.h }, p3/Z, [x26, x2]\n" + ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n" + "ld1b { z28.h }, p3/Z, [x25, x2]\n" + "ld1b { z27.h }, p3/Z, [x24, x2]\n" + ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n" + "ld1b { z23.h }, p3/Z, [x23, x2]\n" + ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n" + "ld1b { z25.h }, p3/Z, [x22, x2]\n" + "ld1b { z24.h }, p3/Z, [x21, x2]\n" + ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n" + "ld1b { z26.h }, p3/Z, [x20, x2]\n" + ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n" + "ld1b { z22.h }, p3/Z, [x19, x2]\n" + ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n" + ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n" + ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n" + ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n" + "1:" // Loop + ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n" + "ldr x20, [x5, #0x50]\n" + "whilelt p0.h, x3, x0\n" + ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n" + "ldr x19, [x5, #0x58]\n" + ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n" + "ldr x25, [x5, #0x60]\n" + ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n" + "ld1b { z31.h }, p3/Z, [x20, x2]\n" + ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n" + ".inst 0x448043a7 // smlalb z7.s, p4/M, z29.h, z0.h\n" + "ldr x24, [x5, #0x68]\n" + ".inst 0x448047a6 // smlalt z6.s, p4/M, z29.h, z0.h\n" + "ldr x23, [x5, #0x70]\n" + ".inst 0x4480438c // smlalb z12.s, p4/M, z28.h, z0.h\n" + "ldr x22, [x5, #0x78]\n" + ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n" + "ld1b { z0.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n" + ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n" + "ldr x15, [x5, #0x80]\n" + ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n" + "ld1b { z30.h }, p3/Z, [x19, x2]\n" + ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n" + ".inst 0x44814373 // smlalb z19.s, p4/M, z27.h, z1.h\n" + "ldr x21, [x5, #0x88]\n" + ".inst 0x44814769 // smlalt z9.s, p4/M, z27.h, z1.h\n" + "ldr x20, [x5, #0x90]\n" + ".inst 0x44814387 // smlalb z7.s, p4/M, z28.h, z1.h\n" + "ldr x19, 
[x5, #0x98]\n" + ".inst 0x44814786 // smlalt z6.s, p4/M, z28.h, z1.h\n" + "ldr x14, [x5, #0xa0]\n" + ".inst 0x448142ec // smlalb z12.s, p4/M, z23.h, z1.h\n" + "ldr x13, [x5, #0xa8]\n" + ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n" + "ld1b { z1.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n" + ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n" + "ldr x12, [x5, #0xb0]\n" + ".inst 0x44824770 // smlalt z16.s, p4/M, z27.h, z2.h\n" + "ld1b { z27.h }, p3/Z, [x25, x2]\n" + ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n" + ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n" + "ldr x11, [x5, #0xb8]\n" + ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n" + "ldr x10, [x5, #0xc0]\n" + ".inst 0x448242e7 // smlalb z7.s, p4/M, z23.h, z2.h\n" + "ldr x9, [x5, #0xc8]\n" + ".inst 0x448246e6 // smlalt z6.s, p4/M, z23.h, z2.h\n" + "ldr x28, [x5, #0xd0]\n" + ".inst 0x448243ec // smlalb z12.s, p4/M, z31.h, z2.h\n" + "ldr x27, [x5, #0xd8]\n" + ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n" + "ld1b { z2.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n" + "ldr x26, [x5, #0xe0]\n" + ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n" + ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n" + "ld1b { z25.h }, p3/Z, [x24, x2]\n" + ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n" + "ldr x25, [x5, #0xe8]\n" + ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n" + ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n" + "ld1w { z18.s }, p2/Z, [x4]\n" + ".inst 0x448343e7 // smlalb z7.s, p4/M, z31.h, z3.h\n" + "ld1w { z20.s }, p1/Z, [x4, #1, MUL VL]\n" + "addvl x4, x4, #2\n" + ".inst 0x448347e6 // smlalt z6.s, p4/M, z31.h, z3.h\n" + ".inst 0x448343cc // smlalb z12.s, p4/M, z30.h, z3.h\n" + ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n" + "ld1b { z3.h }, p4/Z, [x1]\n" + ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n" + "uzp1 z21.s, z18.s, z20.s\n" + "uzp2 z10.s, z18.s, z20.s\n" + "ld1w { z18.s }, p2/Z, [x6]\n" + ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n" + "ld1w { z20.s }, p1/Z, [x6, #1, MUL VL]\n" + "addvl x6, x6, #2\n" + ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n" + "ld1b { z24.h }, p3/Z, [x23, x2]\n" + ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n" + ".inst 0x44844373 // smlalb z19.s, p4/M, z27.h, z4.h\n" + "ldr x24, [x5, #0xf0]\n" + ".inst 0x44844769 // smlalt z9.s, p4/M, z27.h, z4.h\n" + "ld1b { z27.h }, p3/Z, [x22, x2]\n" + ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n" + ".inst 0x448443c7 // smlalb z7.s, p4/M, z30.h, z4.h\n" + "ldr x23, [x5, #0xf8]\n" + ".inst 0x448447c6 // smlalt z6.s, p4/M, z30.h, z4.h\n" + ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n" + "ld1b { z4.h }, p4/Z, [x1, #1, MUL VL]\n" + ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n" + ".inst 0x448043ab // smlalb z11.s, p4/M, z29.h, z0.h\n" + ".inst 0x448047b0 // smlalt z16.s, p4/M, z29.h, z0.h\n" + "uzp1 z29.s, z18.s, z20.s\n" + "uzp2 z20.s, z18.s, z20.s\n" + ".inst 0x44804393 // smlalb z19.s, p4/M, z28.h, z0.h\n" + ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n" + ".inst 0x448042c7 // smlalb z7.s, p4/M, z22.h, z0.h\n" + ".inst 0x448046c6 // smlalt z6.s, p4/M, z22.h, z0.h\n" + ".inst 0x4480432c // smlalb z12.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n" + "ld1b { z0.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n" + ".inst 0x4481438b // 
smlalb z11.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814790 // smlalt z16.s, p4/M, z28.h, z1.h\n" + "ld1b { z28.h }, p3/Z, [x21, x2]\n" + ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n" + ".inst 0x448142f3 // smlalb z19.s, p4/M, z23.h, z1.h\n" + "ldr x22, [x5, #0x100]\n" + ".inst 0x448146e9 // smlalt z9.s, p4/M, z23.h, z1.h\n" + ".inst 0x44814327 // smlalb z7.s, p4/M, z25.h, z1.h\n" + ".inst 0x44814726 // smlalt z6.s, p4/M, z25.h, z1.h\n" + ".inst 0x4481430c // smlalb z12.s, p4/M, z24.h, z1.h\n" + ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n" + "ld1b { z1.h }, p4/Z, [x1, #3, MUL VL]\n" + ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n" + ".inst 0x448242eb // smlalb z11.s, p4/M, z23.h, z2.h\n" + ".inst 0x448246f0 // smlalt z16.s, p4/M, z23.h, z2.h\n" + "ld1b { z23.h }, p3/Z, [x15, x2]\n" + ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n" + ".inst 0x448243f3 // smlalb z19.s, p4/M, z31.h, z2.h\n" + "ldr x21, [x5, #0x108]\n" + ".inst 0x448247e9 // smlalt z9.s, p4/M, z31.h, z2.h\n" + ".inst 0x44824307 // smlalb z7.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824706 // smlalt z6.s, p4/M, z24.h, z2.h\n" + ".inst 0x4482436c // smlalb z12.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n" + "ld1b { z2.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n" + ".inst 0x448343eb // smlalb z11.s, p4/M, z31.h, z3.h\n" + ".inst 0x448347f0 // smlalt z16.s, p4/M, z31.h, z3.h\n" + "ld1b { z31.h }, p3/Z, [x20, x2]\n" + ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n" + ".inst 0x448343d3 // smlalb z19.s, p4/M, z30.h, z3.h\n" + "ldr x20, [x5, #0x110]\n" + ".inst 0x448347c9 // smlalt z9.s, p4/M, z30.h, z3.h\n" + ".inst 0x44834367 // smlalb z7.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834766 // smlalt z6.s, p4/M, z27.h, z3.h\n" + ".inst 0x448342ec // smlalb z12.s, p4/M, z23.h, z3.h\n" + ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n" + "ld1b { z3.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n" + ".inst 0x448443cb // smlalb z11.s, p4/M, z30.h, z4.h\n" + ".inst 0x448447d0 // smlalt z16.s, p4/M, z30.h, z4.h\n" + "ld1b { z30.h }, p3/Z, [x19, x2]\n" + ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n" + ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n" + "ldr x19, [x5, #0x118]\n" + ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n" + "ld1b { z26.h }, p3/Z, [x14, x2]\n" + ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n" + ".inst 0x448442e7 // smlalb z7.s, p4/M, z23.h, z4.h\n" + ".inst 0x448446e6 // smlalt z6.s, p4/M, z23.h, z4.h\n" + ".inst 0x4484438c // smlalb z12.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n" + "ld1b { z4.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n" + ".inst 0x448042cb // smlalb z11.s, p4/M, z22.h, z0.h\n" + ".inst 0x448046d0 // smlalt z16.s, p4/M, z22.h, z0.h\n" + "ld1b { z22.h }, p3/Z, [x11, x2]\n" + ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n" + ".inst 0x44804333 // smlalb z19.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804729 // smlalt z9.s, p4/M, z25.h, z0.h\n" + ".inst 0x448043e7 // smlalb z7.s, p4/M, z31.h, z0.h\n" + ".inst 0x448047e6 // smlalt z6.s, p4/M, z31.h, z0.h\n" + ".inst 0x448043cc // smlalb z12.s, p4/M, z30.h, z0.h\n" + ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n" + "ld1b { z0.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4481432b // smlalb z11.s, p4/M, z25.h, z1.h\n" + ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n" + ".inst 0x44814730 // smlalt z16.s, p4/M, z25.h, z1.h\n" + 
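// Widening MLA phase: each smlalb/smlalt pair accumulates u8 input x weight products into the even/odd 32-bit lanes of the four per-output accumulator pairs (z11/z16, z19/z9, z7/z6, z12/z8). +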
"ld1b { z25.h }, p3/Z, [x13, x2]\n" + ".inst 0x44814313 // smlalb z19.s, p4/M, z24.h, z1.h\n" + ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n" + ".inst 0x44814709 // smlalt z9.s, p4/M, z24.h, z1.h\n" + ".inst 0x448143c7 // smlalb z7.s, p4/M, z30.h, z1.h\n" + ".inst 0x448147c6 // smlalt z6.s, p4/M, z30.h, z1.h\n" + ".inst 0x4481434c // smlalb z12.s, p4/M, z26.h, z1.h\n" + ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n" + "ld1b { z1.h }, p4/Z, [x1]\n" + ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n" + ".inst 0x4482430b // smlalb z11.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824710 // smlalt z16.s, p4/M, z24.h, z2.h\n" + "ld1b { z24.h }, p3/Z, [x12, x2]\n" + ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n" + ".inst 0x44824373 // smlalb z19.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824769 // smlalt z9.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824347 // smlalb z7.s, p4/M, z26.h, z2.h\n" + ".inst 0x44824746 // smlalt z6.s, p4/M, z26.h, z2.h\n" + ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n" + ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n" + "ld1b { z2.h }, p4/Z, [x1, #1, MUL VL]\n" + ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n" + ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834770 // smlalt z16.s, p4/M, z27.h, z3.h\n" + "ld1b { z27.h }, p3/Z, [x10, x2]\n" + ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n" + ".inst 0x448342f3 // smlalb z19.s, p4/M, z23.h, z3.h\n" + ".inst 0x448346e9 // smlalt z9.s, p4/M, z23.h, z3.h\n" + ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n" + ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n" + "ld1b { z3.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n" + ".inst 0x448442eb // smlalb z11.s, p4/M, z23.h, z4.h\n" + ".inst 0x448446f0 // smlalt z16.s, p4/M, z23.h, z4.h\n" + "ld1b { z23.h }, p3/Z, [x9, x2]\n" + ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n" + ".inst 0x44844393 // smlalb z19.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n" + "ld1b { z28.h }, p3/Z, [x26, x2]\n" + ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n" + ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n" + ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n" + ".inst 0x448442cc // smlalb z12.s, p4/M, z22.h, z4.h\n" + ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n" + "ld1b { z4.h }, p4/Z, [x1, #3, MUL VL]\n" + ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n" + ".inst 0x448043eb // smlalb z11.s, p4/M, z31.h, z0.h\n" + ".inst 0x448047f0 // smlalt z16.s, p4/M, z31.h, z0.h\n" + "ld1b { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n" + ".inst 0x448043d3 // smlalb z19.s, p4/M, z30.h, z0.h\n" + ".inst 0x448047c9 // smlalt z9.s, p4/M, z30.h, z0.h\n" + ".inst 0x44804367 // smlalb z7.s, p4/M, z27.h, z0.h\n" + ".inst 0x44804766 // smlalt z6.s, p4/M, z27.h, z0.h\n" + ".inst 0x448042ec // smlalb z12.s, p4/M, z23.h, z0.h\n" + ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n" + "ld1b { z0.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n" + ".inst 0x448143cb // smlalb z11.s, p4/M, z30.h, z1.h\n" + ".inst 0x448147d0 // smlalt z16.s, p4/M, z30.h, z1.h\n" + "ld1b { z30.h }, p3/Z, [x27, x2]\n" + ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n" + ".inst 0x44814353 // smlalb z19.s, p4/M, z26.h, z1.h\n" + ".inst 0x44814749 // smlalt z9.s, p4/M, z26.h, z1.h\n" + ".inst 0x448142e7 // smlalb z7.s, p4/M, 
z23.h, z1.h\n" + ".inst 0x448146e6 // smlalt z6.s, p4/M, z23.h, z1.h\n" + ".inst 0x448143ec // smlalb z12.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n" + "ld1b { z1.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n" + ".inst 0x4482434b // smlalb z11.s, p4/M, z26.h, z2.h\n" + ".inst 0x44824750 // smlalt z16.s, p4/M, z26.h, z2.h\n" + "ld1b { z26.h }, p3/Z, [x25, x2]\n" + ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n" + ".inst 0x44824333 // smlalb z19.s, p4/M, z25.h, z2.h\n" + ".inst 0x44824729 // smlalt z9.s, p4/M, z25.h, z2.h\n" + ".inst 0x448243e7 // smlalb z7.s, p4/M, z31.h, z2.h\n" + ".inst 0x448247e6 // smlalt z6.s, p4/M, z31.h, z2.h\n" + ".inst 0x448243cc // smlalb z12.s, p4/M, z30.h, z2.h\n" + ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n" + "ld1b { z2.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n" + ".inst 0x4483432b // smlalb z11.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834730 // smlalt z16.s, p4/M, z25.h, z3.h\n" + "ld1b { z25.h }, p3/Z, [x24, x2]\n" + ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n" + ".inst 0x44834313 // smlalb z19.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834709 // smlalt z9.s, p4/M, z24.h, z3.h\n" + ".inst 0x448343c7 // smlalb z7.s, p4/M, z30.h, z3.h\n" + ".inst 0x448347c6 // smlalt z6.s, p4/M, z30.h, z3.h\n" + ".inst 0x4483438c // smlalb z12.s, p4/M, z28.h, z3.h\n" + ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n" + "ld1b { z3.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4484430b // smlalb z11.s, p4/M, z24.h, z4.h\n" + ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n" + ".inst 0x44844710 // smlalt z16.s, p4/M, z24.h, z4.h\n" + "ld1b { z24.h }, p3/Z, [x23, x2]\n" + ".inst 0x448442d3 // smlalb z19.s, p4/M, z22.h, z4.h\n" + ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n" + ".inst 0x448446c9 // smlalt z9.s, p4/M, z22.h, z4.h\n" + ".inst 0x44844387 // smlalb z7.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844786 // smlalt z6.s, p4/M, z28.h, z4.h\n" + ".inst 0x4484434c // smlalb z12.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n" + "ld1b { z4.h }, p4/Z, [x1]\n" + "inch x1\n" + ".inst 0x4480436b // smlalb z11.s, p4/M, z27.h, z0.h\n" + ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n" + ".inst 0x44804770 // smlalt z16.s, p4/M, z27.h, z0.h\n" + "ld1b { z27.h }, p3/Z, [x22, x2]\n" + ".inst 0x448042f3 // smlalb z19.s, p4/M, z23.h, z0.h\n" + ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n" + ".inst 0x448046e9 // smlalt z9.s, p4/M, z23.h, z0.h\n" + ".inst 0x44804327 // smlalb z7.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804726 // smlalt z6.s, p4/M, z25.h, z0.h\n" + "ld1b { z25.h }, p3/Z, [x21, x2]\n" + ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n" + ".inst 0x4480430c // smlalb z12.s, p4/M, z24.h, z0.h\n" + ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n" + ".inst 0x448142eb // smlalb z11.s, p4/M, z23.h, z1.h\n" + ".inst 0x448146f0 // smlalt z16.s, p4/M, z23.h, z1.h\n" + ".inst 0x448143f3 // smlalb z19.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n" + ".inst 0x44814307 // smlalb z7.s, p4/M, z24.h, z1.h\n" + ".inst 0x44814706 // smlalt z6.s, p4/M, z24.h, z1.h\n" + "ld1b { z24.h }, p3/Z, [x20, x2]\n" + ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n" + ".inst 0x4481436c // smlalb z12.s, p4/M, z27.h, z1.h\n" + ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n" + ".inst 0x448243eb // smlalb z11.s, p4/M, z31.h, z2.h\n" + ".inst 0x448247f0 // smlalt z16.s, p4/M, z31.h, 
z2.h\n" + ".inst 0x448243d3 // smlalb z19.s, p4/M, z30.h, z2.h\n" + ".inst 0x448247c9 // smlalt z9.s, p4/M, z30.h, z2.h\n" + ".inst 0x44824367 // smlalb z7.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824766 // smlalt z6.s, p4/M, z27.h, z2.h\n" + "ld1b { z27.h }, p3/Z, [x19, x2]\n" + "inch x2\n" + ".inst 0x4482432c // smlalb z12.s, p4/M, z25.h, z2.h\n" + "whilelt p2.s, x2, x0\n" + ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n" + "mov x19, x2\n" + ".inst 0x448343cb // smlalb z11.s, p4/M, z30.h, z3.h\n" + ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n" + ".inst 0x448347d0 // smlalt z16.s, p4/M, z30.h, z3.h\n" + "incw x19\n" + ".inst 0x44834393 // smlalb z19.s, p4/M, z28.h, z3.h\n" + "whilelt p1.s, x19, x0\n" + ".inst 0x44834789 // smlalt z9.s, p4/M, z28.h, z3.h\n" + "whilelt p3.h, x2, x0\n" + ".inst 0x44834327 // smlalb z7.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834726 // smlalt z6.s, p4/M, z25.h, z3.h\n" + ".inst 0x4483430c // smlalb z12.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n" + ".inst 0x4484438b // smlalb z11.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844790 // smlalt z16.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844353 // smlalb z19.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844749 // smlalt z9.s, p4/M, z26.h, z4.h\n" + ".inst 0x04b5756b // sqrdmulh z11.s, z11.s, z21.s\n" + ".inst 0x04aa7610 // sqrdmulh z16.s, z16.s, z10.s\n" + ".inst 0x04b57673 // sqrdmulh z19.s, z19.s, z21.s\n" + ".inst 0x04aa7529 // sqrdmulh z9.s, z9.s, z10.s\n" + "and z31.d, z11.d, z29.d\n" + "asr z31.s, z31.s, #0x1f\n" + "and z23.d, z16.d, z20.d\n" + "and z25.d, z19.d, z29.d\n" + "asr z23.s, z23.s, #0x1f\n" + "and z18.d, z9.d, z20.d\n" + ".inst 0x44844307 // smlalb z7.s, p4/M, z24.h, z4.h\n" + "asr z25.s, z25.s, #0x1f\n" + ".inst 0x44844706 // smlalt z6.s, p4/M, z24.h, z4.h\n" + "asr z18.s, z18.s, #0x1f\n" + "sqadd z11.s, z11.s, z31.s\n" + ".inst 0x4484436c // smlalb z12.s, p4/M, z27.h, z4.h\n" + ".inst 0x04b574e7 // sqrdmulh z7.s, z7.s, z21.s\n" + "sqadd z16.s, z16.s, z23.s\n" + "sqadd z19.s, z19.s, z25.s\n" + ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n" + "sqadd z9.s, z9.s, z18.s\n" + "and z1.d, z7.d, z29.d\n" + "asr z1.s, z1.s, #0x1f\n" + "and z18.d, z6.d, z20.d\n" + ".inst 0x04b5758c // sqrdmulh z12.s, z12.s, z21.s\n" + "asr z18.s, z18.s, #0x1f\n" + ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n" + ".inst 0x448293ab // srshl z11.s, p4/M, z11.s, z29.s\n" + "and z30.d, z12.d, z29.d\n" + "asr z30.s, z30.s, #0x1f\n" + "add z11.s, z11.s, z14.s\n" + "sqadd z7.s, z7.s, z1.s\n" + "sqadd z6.s, z6.s, z18.s\n" + ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n" + "smin z11.s, p4/M, z11.s, z15.s\n" + ".inst 0x44829290 // srshl z16.s, p4/M, z16.s, z20.s\n" + "sqadd z12.s, z12.s, z30.s\n" + "and z3.d, z8.d, z20.d\n" + "asr z3.s, z3.s, #0x1f\n" + "add z16.s, z16.s, z14.s\n" + "smax z11.s, p4/M, z11.s, z5.s\n" + ".inst 0x448293b3 // srshl z19.s, p4/M, z19.s, z29.s\n" + ".inst 0x44829289 // srshl z9.s, p4/M, z9.s, z20.s\n" + "smin z16.s, p4/M, z16.s, z15.s\n" + ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n" + "add z19.s, z19.s, z14.s\n" + "add z9.s, z9.s, z14.s\n" + "sqadd z8.s, z8.s, z3.s\n" + "add z7.s, z7.s, z14.s\n" + "smax z16.s, p4/M, z16.s, z5.s\n" + "smin z19.s, p4/M, z19.s, z15.s\n" + "smin z9.s, p4/M, z9.s, z15.s\n" + "smin z7.s, p4/M, z7.s, z15.s\n" + "trn1 z11.h, z11.h, z16.h\n" + "st1b { z11.h }, p0, [x7, x3]\n" + "smax z19.s, p4/M, z19.s, z5.s\n" + "smax z9.s, p4/M, z9.s, z5.s\n" + "smax z7.s, p4/M, z7.s, z5.s\n" + ".inst 0x44829286 // srshl z6.s, p4/M, z6.s, z20.s\n" + 
".inst 0x448293ac // srshl z12.s, p4/M, z12.s, z29.s\n" + "trn1 z19.h, z19.h, z9.h\n" + "st1b { z19.h }, p0, [x8, x3]\n" + "add z6.s, z6.s, z14.s\n" + ".inst 0x44829288 // srshl z8.s, p4/M, z8.s, z20.s\n" + "add z12.s, z12.s, z14.s\n" + "smin z6.s, p4/M, z6.s, z15.s\n" + "add z8.s, z8.s, z14.s\n" + "smin z12.s, p4/M, z12.s, z15.s\n" + "smax z6.s, p4/M, z6.s, z5.s\n" + "smin z8.s, p4/M, z8.s, z15.s\n" + "smax z12.s, p4/M, z12.s, z5.s\n" + "trn1 z7.h, z7.h, z6.h\n" + "st1b { z7.h }, p0, [x17, x3]\n" + "smax z8.s, p4/M, z8.s, z5.s\n" + "trn1 z12.h, z12.h, z8.h\n" + "st1b { z12.h }, p0, [x16, x3]\n" + "inch x3\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z19.s }, p2/Z, [x19]\n" + "ld1w { z6.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z11.s, z19.s, z6.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z16.s, z19.s, z6.s\n" + "mov z19.d, z11.d\n" + "ld1b { z0.h }, p4/Z, [x1]\n" + ".inst 0x454d1800 // usublb z0.h, z0.b, z13.b\n" + "mov z9.d, z16.d\n" + "ld1b { z1.h }, p4/Z, [x1, #1, MUL VL]\n" + "mov z7.d, z11.d\n" + "ld1b { z2.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x454d1821 // usublb z1.h, z1.b, z13.b\n" + "mov z6.d, z16.d\n" + "ld1b { z3.h }, p4/Z, [x1, #3, MUL VL]\n" + "mov z12.d, z11.d\n" + "ld1b { z4.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n" + "mov z8.d, z16.d\n" + "ldp x28, x27, [x5, #0x0]\n" + ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n" + "ldp x26, x25, [x5, #0x10]\n" + ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n" + "ldp x24, x23, [x5, #0x20]\n" + "ldp x22, x21, [x5, #0x30]\n" + "ldp x20, x19, [x5, #0x40]\n" + "ld1b { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x45511bff // usublb z31.h, z31.b, z17.b\n" + "ld1b { z30.h }, p3/Z, [x27, x2]\n" + "ld1b { z29.h }, p3/Z, [x26, x2]\n" + ".inst 0x45511bde // usublb z30.h, z30.b, z17.b\n" + "ld1b { z28.h }, p3/Z, [x25, x2]\n" + "ld1b { z27.h }, p3/Z, [x24, x2]\n" + ".inst 0x45511bbd // usublb z29.h, z29.b, z17.b\n" + "ld1b { z23.h }, p3/Z, [x23, x2]\n" + ".inst 0x45511b9c // usublb z28.h, z28.b, z17.b\n" + "ld1b { z25.h }, p3/Z, [x22, x2]\n" + "ld1b { z24.h }, p3/Z, [x21, x2]\n" + ".inst 0x45511b7b // usublb z27.h, z27.b, z17.b\n" + "ld1b { z26.h }, p3/Z, [x20, x2]\n" + ".inst 0x45511af7 // usublb z23.h, z23.b, z17.b\n" + "ld1b { z22.h }, p3/Z, [x19, x2]\n" + ".inst 0x45511b39 // usublb z25.h, z25.b, z17.b\n" + ".inst 0x45511b18 // usublb z24.h, z24.b, z17.b\n" + ".inst 0x45511b5a // usublb z26.h, z26.b, z17.b\n" + ".inst 0x45511ad6 // usublb z22.h, z22.b, z17.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", 
"x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp new file mode 100644 index 0000000000..9226a96662 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + +struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst +{ + typedef uint32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 4; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 9; + constexpr static unsigned int input_col_quads = 1; + + kern_type kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + + sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..bb9931c20f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + + +#include "arm_gemm.hpp" +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const void *params, + unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "mov z31.s, #0x0\n" + "ldr x24, [%x[inptrs], #0x0]\n" + "ptrue p2.b\n" + "mov z18.s, #0x0\n" + "ldr x23, [%x[inptrs], #0x8]\n" + "lsl x9, %x[n_channels], #0x2\n" + "mov z29.s, #0x0\n" + "ldr x22, [%x[inptrs], #0x10]\n" + "addvl SP, SP, #-8\n" + "mov z28.s, #0x0\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "mov x19, #0x9\n" + "mov z13.s, #0x0\n" + "ldr x20, [%x[inptrs], #0x20]\n" + "whilelt p1.b, XZR, x19\n" + "mov z14.s, #0x0\n" + "ld1b { z7.b }, p1/Z, [x24]\n" + "mov x19, #0x3\n" + "mov z15.s, #0x0\n" + "ld1b { z3.b }, p1/Z, [x23]\n" + "whilelt p0.b, XZR, x19\n" + "mov z11.b, p0/z, #0x1\n" + "ld1b { z4.b }, p1/Z, [x22]\n" + "mov x28, #0x0\n" + "mov z10.d, z7.d\n" + "ld1b { z6.b }, p1/Z, [x21]\n" + "mov x27, #0x0\n" + "ext z10.b, z10.b, z10.b, #0x2\n" + "ld1b { z5.b }, p1/Z, [x20]\n" + "whilelt p1.b, x28, x9\n" + "mov z17.d, z7.d\n" + "ld1rw { z30.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n" + "mov z26.d, z7.d\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "ext z26.b, z26.b, z26.b, #0x6\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "mov z19.d, z3.d\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "ext z19.b, z19.b, z19.b, #0x2\n" + "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n" + "zip1 z7.s, z7.s, z17.s\n" + "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n" + "zip1 z10.s, z10.s, z26.s\n" + "ld1rw { z0.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n" + "zip1 z7.s, z7.s, z10.s\n" + "ld1w { z1.s }, p1/Z, [%x[params]]\n" + "mov z7.q, z7.q[0]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n" + "mov z17.d, z3.d\n" + "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n" + "addvl %x[params], %x[params], #4\n" + "mov z2.d, z3.d\n" + "mov z20.d, z4.d\n" + "ext z2.b, z2.b, z2.b, #0x6\n" + "zip1 z3.s, z3.s, z17.s\n" + "ext z20.b, z20.b, z20.b, #0x2\n" + "mov z17.d, z4.d\n" + "zip1 z19.s, z19.s, z2.s\n" + "zip1 z3.s, z3.s, z19.s\n" + "mov z3.q, z3.q[0]\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "mov z26.d, z4.d\n" + "ext z26.b, z26.b, z26.b, #0x6\n" + "mov z21.d, z6.d\n" + "zip1 z4.s, z4.s, z17.s\n" + "ext z21.b, z21.b, z21.b, #0x2\n" + "zip1 z20.s, z20.s, z26.s\n" + "zip1 z4.s, z4.s, z20.s\n" + "mov z4.q, z4.q[0]\n" + "mov z17.d, z6.d\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "mov z20.d, z6.d\n" + "ext z20.b, z20.b, z20.b, #0x6\n" + "mov z19.d, z5.d\n" + "zip1 z6.s, z6.s, z17.s\n" + "ext z19.b, z19.b, z19.b, #0x2\n" + "zip1 z21.s, z21.s, z20.s\n" + "zip1 z6.s, z6.s, z21.s\n" + "mov z6.q, z6.q[0]\n" + "mov z17.d, z5.d\n" + "ext z17.b, z17.b, z17.b, #0x4\n" + "mov z20.d, z5.d\n" + "ext z20.b, z20.b, z20.b, #0x6\n" + "mov z11.s, z11.s[0]\n" + "zip1 z5.s, z5.s, z17.s\n" + "mov z25.s, #0x0\n" + "zip1 z19.s, z19.s, z20.s\n" + "zip1 z5.s, z5.s, z19.s\n" + "mov z5.q, z5.q[0]\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z2.s, #0x0\n" + "mov z19.s,
#0x0\n" + "udot z31.s, z11.b, z7.b[0]\n" + "udot z18.s, z11.b, z7.b[1]\n" + "udot z29.s, z11.b, z7.b[2]\n" + "udot z28.s, z11.b, z7.b[3]\n" + "udot z13.s, z11.b, z3.b[0]\n" + "udot z14.s, z11.b, z3.b[1]\n" + "udot z15.s, z11.b, z3.b[2]\n" + "udot z25.s, z11.b, z3.b[3]\n" + "udot z26.s, z11.b, z4.b[0]\n" + "udot z27.s, z11.b, z4.b[1]\n" + "udot z24.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z4.b[3]\n" + "udot z22.s, z11.b, z6.b[0]\n" + "udot z21.s, z11.b, z6.b[1]\n" + "udot z17.s, z11.b, z6.b[2]\n" + "udot z20.s, z11.b, z6.b[3]\n" + "udot z2.s, z11.b, z5.b[0]\n" + "udot z19.s, z11.b, z5.b[1]\n" + "mov z31.d, z31.d\n" + "mov z18.d, z18.d\n" + "mov z29.d, z29.d\n" + "mov z28.d, z28.d\n" + "add z31.s, z31.s, z13.s\n" + "mov z13.s, #0x0\n" + "udot z13.s, z11.b, z5.b[2]\n" + "add z18.s, z18.s, z14.s\n" + "mov z14.s, #0x0\n" + "udot z14.s, z11.b, z5.b[3]\n" + "add z29.s, z29.s, z15.s\n" + "add z28.s, z28.s, z25.s\n" + "add z31.s, z31.s, z26.s\n" + "add z18.s, z18.s, z27.s\n" + "add z29.s, z29.s, z24.s\n" + "add z28.s, z28.s, z23.s\n" + "mov z26.d, z26.d\n" + "mov z25.d, z27.d\n" + "mov z24.d, z24.d\n" + "mov z23.d, z23.d\n" + "add z26.s, z26.s, z22.s\n" + "add z25.s, z25.s, z21.s\n" + "add z24.s, z24.s, z17.s\n" + "add z23.s, z23.s, z20.s\n" + "add z26.s, z26.s, z2.s\n" + "add z25.s, z25.s, z19.s\n" + "add z24.s, z24.s, z13.s\n" + "add z23.s, z23.s, z14.s\n" + "neg z30.s, p2/M, z30.s\n" + "mul z31.s, p2/M, z31.s, z30.s\n" + "st1w { z31.s }, p2, [SP]\n" + "add z31.s, z31.s, z1.s\n" + "mul z18.s, p2/M, z18.s, z30.s\n" + "st1w { z18.s }, p2, [SP, #1, MUL VL]\n" + "add z18.s, z18.s, z1.s\n" + "mul z29.s, p2/M, z29.s, z30.s\n" + "st1w { z29.s }, p2, [SP, #2, MUL VL]\n" + "add z29.s, z29.s, z1.s\n" + "mul z28.s, p2/M, z28.s, z30.s\n" + "st1w { z28.s }, p2, [SP, #3, MUL VL]\n" + "add z28.s, z28.s, z1.s\n" + "mul z26.s, p2/M, z26.s, z30.s\n" + "st1w { z26.s }, p2, [SP, #4, MUL VL]\n" + "add z26.s, z26.s, z1.s\n" + "mul z25.s, p2/M, z25.s, z30.s\n" + "st1w { z25.s }, p2, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z1.s\n" + "mul z24.s, p2/M, z24.s, z30.s\n" + "st1w { z24.s }, p2, [SP, #6, MUL VL]\n" + "add z24.s, z24.s, z1.s\n" + "mul z23.s, p2/M, z23.s, z30.s\n" + "st1w { z23.s }, p2, [SP, #7, MUL VL]\n" + "add z23.s, z23.s, z1.s\n" + "1:" // Loop + "udot z31.s, z8.b, z7.b[0]\n" + "ld1w { z22.s }, p2/Z, [%x[params]]\n" + "incb x28\n" + "udot z18.s, z8.b, z7.b[1]\n" + "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n" + "whilelt p0.s, x27, %x[n_channels]\n" + "udot z29.s, z8.b, z7.b[2]\n" + "whilelt p1.b, x28, x9\n" + "ld1w { z1.s }, p1/Z, [%x[params], #2, MUL VL]\n" + "udot z28.s, z8.b, z7.b[3]\n" + "udot z26.s, z8.b, z4.b[0]\n" + "udot z25.s, z8.b, z4.b[1]\n" + "udot z24.s, z8.b, z4.b[2]\n" + "udot z23.s, z8.b, z4.b[3]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #3, MUL VL]\n" + "udot z31.s, z9.b, z3.b[0]\n" + "udot z18.s, z9.b, z3.b[1]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "udot z28.s, z9.b, z3.b[3]\n" + "udot z26.s, z9.b, z6.b[0]\n" + "udot z25.s, z9.b, z6.b[1]\n" + "udot z24.s, z9.b, z6.b[2]\n" + "udot z23.s, z9.b, z6.b[3]\n" + "ld1b { z9.b }, p1/Z, [%x[params], #4, MUL VL]\n" + "udot z31.s, z10.b, z4.b[0]\n" + "udot z18.s, z10.b, z4.b[1]\n" + "udot z29.s, z10.b, z4.b[2]\n" + "udot z28.s, z10.b, z4.b[3]\n" + "udot z26.s, z10.b, z5.b[0]\n" + "udot z25.s, z10.b, z5.b[1]\n" + "udot z24.s, z10.b, z5.b[2]\n" + "udot z23.s, z10.b, z5.b[3]\n" + "ld1b { z10.b }, p1/Z, [%x[params], #5, MUL VL]\n" + "addvl %x[params], %x[params], #6\n" + ".inst 0x04b677ff // sqrdmulh z31.s, z31.s, z22.s\n" + ".inst 0x04b67652 // 
sqrdmulh z18.s, z18.s, z22.s\n" + ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n" + ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n" + ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n" + "and z20.d, z31.d, z21.d\n" + "asr z20.s, z20.s, #0x1f\n" + "and z19.d, z18.d, z21.d\n" + "and z14.d, z29.d, z21.d\n" + "asr z19.s, z19.s, #0x1f\n" + "and z17.d, z28.d, z21.d\n" + "and z2.d, z26.d, z21.d\n" + "asr z14.s, z14.s, #0x1f\n" + ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z31.s, z31.s, z20.s\n" + ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n" + "asr z2.s, z2.s, #0x1f\n" + ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n" + "sqadd z18.s, z18.s, z19.s\n" + "sqadd z29.s, z29.s, z14.s\n" + "and z27.d, z25.d, z21.d\n" + "asr z27.s, z27.s, #0x1f\n" + "sqadd z28.s, z28.s, z17.s\n" + "sqadd z26.s, z26.s, z2.s\n" + "and z17.d, z24.d, z21.d\n" + "asr z17.s, z17.s, #0x1f\n" + "and z15.d, z23.d, z21.d\n" + ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n" + "asr z15.s, z15.s, #0x1f\n" + "sqadd z25.s, z25.s, z27.s\n" + ".inst 0x44828ab2 // srshl z18.s, p2/M, z18.s, z21.s\n" + "add z31.s, z31.s, z12.s\n" + "sqadd z24.s, z24.s, z17.s\n" + ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n" + "add z18.s, z18.s, z12.s\n" + "sqadd z23.s, z23.s, z15.s\n" + "smin z31.s, p2/M, z31.s, z0.s\n" + "add z29.s, z29.s, z12.s\n" + "smin z18.s, p2/M, z18.s, z0.s\n" + ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n" + "smax z31.s, p2/M, z31.s, z16.s\n" + "st1b { z31.s }, p0, [x26, x27]\n" + "add z28.s, z28.s, z12.s\n" + "smax z18.s, p2/M, z18.s, z16.s\n" + "ld1w { z31.s }, p2/Z, [SP]\n" + "smin z29.s, p2/M, z29.s, z0.s\n" + "st1b { z18.s }, p0, [x25, x27]\n" + "add z31.s, z31.s, z1.s\n" + "smin z28.s, p2/M, z28.s, z0.s\n" + "ld1w { z18.s }, p2/Z, [SP, #1, MUL VL]\n" + "smax z29.s, p2/M, z29.s, z16.s\n" + "st1b { z29.s }, p0, [x24, x27]\n" + "add z18.s, z18.s, z1.s\n" + "smax z28.s, p2/M, z28.s, z16.s\n" + "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n" + ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n" + "st1b { z28.s }, p0, [x23, x27]\n" + "add z29.s, z29.s, z1.s\n" + ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n" + "ld1w { z28.s }, p2/Z, [SP, #3, MUL VL]\n" + "add z26.s, z26.s, z12.s\n" + ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n" + ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n" + "add z25.s, z25.s, z12.s\n" + "add z28.s, z28.s, z1.s\n" + "add z24.s, z24.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "smin z26.s, p2/M, z26.s, z0.s\n" + "smin z25.s, p2/M, z25.s, z0.s\n" + "smin z24.s, p2/M, z24.s, z0.s\n" + "smin z23.s, p2/M, z23.s, z0.s\n" + "smax z26.s, p2/M, z26.s, z16.s\n" + "st1b { z26.s }, p0, [x22, x27]\n" + "smax z25.s, p2/M, z25.s, z16.s\n" + "smax z24.s, p2/M, z24.s, z16.s\n" + "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n" + "smax z23.s, p2/M, z23.s, z16.s\n" + "st1b { z25.s }, p0, [x21, x27]\n" + "add z26.s, z26.s, z1.s\n" + "st1b { z24.s }, p0, [x20, x27]\n" + "st1b { z23.s }, p0, [x19, x27]\n" + "incw x27\n" + "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z1.s\n" + "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n" + "add z24.s, z24.s, z1.s\n" + "add z23.s, z23.s, z1.s\n" + "b.any 1b\n" + "addvl SP, SP, #8\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" 
(offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp new file mode 100644 index 0000000000..3023ed16e5 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + +struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst +{ + typedef uint32_t bias_type; + typedef uint8_t input_type; + typedef uint8_t weight_type; + typedef uint8_t return_type; + + typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 4; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 8; + constexpr static unsigned int input_cols = 6; + constexpr static unsigned int input_col_quads = 1; + + kern_type kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + + sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp new file mode 100644 index 0000000000..fc1e23e897 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + + +#include "arm_gemm.hpp" +#include <cstddef> +#include <cstdint> + +#if defined(__ARM_FEATURE_SVE) + +namespace arm_conv { +namespace depthwise { + +void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl( + const uint8_t *const *const inptrs, + uint8_t *const *const outptrs, + const void *params, + unsigned int n_output_channels, + const arm_gemm::Requantize32& qp +) +{ + __asm__ __volatile__( + "mov z20.b, #0x1\n" + "ldr x24, [%x[inptrs], #0x0]\n" + "ptrue p2.b\n" + "mov z22.s, #0x1\n" + "ldr x23, [%x[inptrs], #0x8]\n" + "lsl x9, %x[n_channels], #0x2\n" + "mov z30.s, #0x0\n" + "ldr x22, [%x[inptrs], #0x10]\n" + "addvl SP, SP, #-8\n" + "mov z28.s, #0x0\n" + "ldr x21, [%x[inptrs], #0x18]\n" + "mov x20, #0x6\n" + "mov z29.s, #0x0\n" + "ldr x19, [%x[inptrs], #0x20]\n" + "whilelt p0.b, XZR, x20\n" + "mov z27.s, #0x0\n" + "ld1b { z0.b }, p0/Z, [x24]\n" + "mov x28, #0x0\n" + "mov z26.s, #0x0\n" + "ld1b { z3.b }, p0/Z, [x23]\n" + "mov x27, #0x0\n" + "mov z25.s, #0x0\n" + "ld1b { z5.b }, p0/Z, [x22]\n" + "whilelt p1.b, x28, x9\n" + "mov z15.d, z0.d\n" + "ld1b { z4.b }, p0/Z, [x21]\n" + "mov z24.s, #0x0\n" + "ld1b { z6.b }, p0/Z, [x19]\n" + "ext z15.b, z15.b, z15.b, #0x1\n" + "ldr x21, [%x[inptrs], #0x28]\n" + "mov z16.d, z3.d\n" + "ldr x20, [%x[inptrs], #0x30]\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "ldr x19, [%x[inptrs], #0x38]\n" + "mov z18.d, z5.d\n" + "ld1b { z7.b }, p0/Z, [x21]\n" + "zip1 z0.d, z0.d, z15.d\n" + "ld1b { z1.b }, p0/Z, [x20]\n" + "mov z0.q, z0.q[0]\n" + "ld1b { z2.b }, p0/Z, [x19]\n" + "zip1 z3.d, z3.d, z16.d\n" + "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n" + "mov z3.q, z3.q[0]\n" + "ldp x26, x25, [%x[outptrs], #0x0]\n" + "ext z18.b, z18.b, z18.b, #0x1\n" + "ldp x24, x23, [%x[outptrs], #0x10]\n" + "mov z16.d, z4.d\n" + "ldp x22, x21, [%x[outptrs], #0x20]\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "ldp x20, x19, [%x[outptrs], #0x30]\n" + "mov z17.d, z6.d\n" + "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n" + "zip1 z5.d, z5.d, z18.d\n" + "ld1rw { z31.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n" + "mov z5.q, z5.q[0]\n" + "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n" + "zip1 z4.d, z4.d, z16.d\n" + "ld1w { z13.s }, p1/Z, [%x[params]]\n" + "mov z4.q, z4.q[0]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #1, MUL VL]\n" + "ext z17.b, z17.b, z17.b, #0x1\n" + "ld1b { z9.b }, p1/Z, [%x[params], #2, MUL VL]\n" + "mov z16.d, z7.d\n" + "ld1b { z10.b }, p1/Z, [%x[params], #3, MUL VL]\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "ld1b { z11.b }, p1/Z, [%x[params], #4, MUL VL]\n" + "addvl %x[params], %x[params], #5\n" + "zip1 z6.d, z6.d, z17.d\n" + "mov z17.d, z1.d\n" + "mov z6.q, z6.q[0]\n" + "zip1 z7.d, z7.d, z16.d\n" + "mov z7.q, z7.q[0]\n" + "ext z17.b, z17.b, z17.b, #0x1\n" + "mov z16.d, z2.d\n" + "ext z16.b, z16.b, z16.b, #0x1\n" + "mov z23.s, #0x0\n" + "zip1 z1.d, z1.d, z17.d\n" + "mov z1.q, z1.q[0]\n" + "zip1 z2.d, z2.d, z16.d\n" + "mov z2.q, z2.q[0]\n" + "mov z18.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z19.s, #0x0\n" + "udot z30.s, z20.b, z0.b[0]\n" + "udot z28.s, z20.b, z0.b[2]\n" + "udot z29.s, z20.b, z3.b[0]\n" + "udot z27.s, z20.b, z3.b[2]\n" + "udot z30.s, z22.b, z0.b[1]\n" + "udot z28.s, z22.b, z0.b[3]\n" + "udot z29.s, z22.b, z3.b[1]\n" + "udot z27.s, z22.b, z3.b[3]\n" + "udot z26.s, z20.b, z5.b[0]\n" + "udot z25.s, z20.b, z5.b[2]\n" + "udot z24.s, z20.b, z4.b[0]\n" + "udot z23.s, z20.b, z4.b[2]\n" + "udot z26.s, z22.b, z5.b[1]\n" + "udot z25.s,
z22.b, z5.b[3]\n" + "udot z24.s, z22.b, z4.b[1]\n" + "udot z23.s, z22.b, z4.b[3]\n" + "udot z18.s, z20.b, z6.b[0]\n" + "udot z17.s, z20.b, z6.b[2]\n" + "udot z16.s, z20.b, z7.b[0]\n" + "udot z21.s, z20.b, z7.b[2]\n" + "udot z18.s, z22.b, z6.b[1]\n" + "udot z17.s, z22.b, z6.b[3]\n" + "udot z16.s, z22.b, z7.b[1]\n" + "udot z21.s, z22.b, z7.b[3]\n" + "udot z19.s, z20.b, z1.b[0]\n" + "mov z30.d, z30.d\n" + "mov z28.d, z28.d\n" + "add z30.s, z30.s, z29.s\n" + "udot z19.s, z22.b, z1.b[1]\n" + "add z28.s, z28.s, z27.s\n" + "add z30.s, z30.s, z26.s\n" + "mov z29.d, z29.d\n" + "add z28.s, z28.s, z25.s\n" + "add z30.s, z30.s, z24.s\n" + "mov z27.d, z27.d\n" + "add z28.s, z28.s, z23.s\n" + "add z30.s, z30.s, z18.s\n" + "add z29.s, z29.s, z26.s\n" + "add z28.s, z28.s, z17.s\n" + "add z27.s, z27.s, z25.s\n" + "add z29.s, z29.s, z24.s\n" + "mov z26.d, z26.d\n" + "add z27.s, z27.s, z23.s\n" + "add z29.s, z29.s, z18.s\n" + "mov z25.d, z25.d\n" + "add z27.s, z27.s, z17.s\n" + "add z29.s, z29.s, z16.s\n" + "add z26.s, z26.s, z24.s\n" + "add z27.s, z27.s, z21.s\n" + "add z25.s, z25.s, z23.s\n" + "add z26.s, z26.s, z18.s\n" + "mov z24.d, z24.d\n" + "add z25.s, z25.s, z17.s\n" + "add z26.s, z26.s, z16.s\n" + "mov z23.d, z23.d\n" + "add z25.s, z25.s, z21.s\n" + "add z26.s, z26.s, z19.s\n" + "add z24.s, z24.s, z18.s\n" + "mov z18.s, #0x0\n" + "udot z18.s, z20.b, z1.b[2]\n" + "add z23.s, z23.s, z17.s\n" + "mov z17.s, #0x0\n" + "udot z17.s, z20.b, z2.b[0]\n" + "udot z18.s, z22.b, z1.b[3]\n" + "add z24.s, z24.s, z16.s\n" + "mov z16.s, #0x0\n" + "udot z17.s, z22.b, z2.b[1]\n" + "udot z16.s, z20.b, z2.b[2]\n" + "add z25.s, z25.s, z18.s\n" + "add z23.s, z23.s, z21.s\n" + "add z24.s, z24.s, z19.s\n" + "udot z16.s, z22.b, z2.b[3]\n" + "add z23.s, z23.s, z18.s\n" + "add z24.s, z24.s, z17.s\n" + "neg z15.s, p2/M, z15.s\n" + "add z23.s, z23.s, z16.s\n" + "mul z30.s, p2/M, z30.s, z15.s\n" + "st1w { z30.s }, p2, [SP]\n" + "add z30.s, z30.s, z13.s\n" + "mul z28.s, p2/M, z28.s, z15.s\n" + "st1w { z28.s }, p2, [SP, #1, MUL VL]\n" + "add z28.s, z28.s, z13.s\n" + "mul z29.s, p2/M, z29.s, z15.s\n" + "st1w { z29.s }, p2, [SP, #2, MUL VL]\n" + "add z29.s, z29.s, z13.s\n" + "mul z27.s, p2/M, z27.s, z15.s\n" + "st1w { z27.s }, p2, [SP, #3, MUL VL]\n" + "add z27.s, z27.s, z13.s\n" + "mul z26.s, p2/M, z26.s, z15.s\n" + "st1w { z26.s }, p2, [SP, #4, MUL VL]\n" + "add z26.s, z26.s, z13.s\n" + "mul z25.s, p2/M, z25.s, z15.s\n" + "st1w { z25.s }, p2, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z13.s\n" + "mul z24.s, p2/M, z24.s, z15.s\n" + "st1w { z24.s }, p2, [SP, #6, MUL VL]\n" + "add z24.s, z24.s, z13.s\n" + "mul z23.s, p2/M, z23.s, z15.s\n" + "st1w { z23.s }, p2, [SP, #7, MUL VL]\n" + "add z23.s, z23.s, z13.s\n" + "1:" // Loop + "udot z30.s, z8.b, z0.b[0]\n" + "ld1w { z22.s }, p2/Z, [%x[params], #6, MUL VL]\n" + "incb x28\n" + "udot z28.s, z8.b, z0.b[2]\n" + "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n" + "whilelt p0.s, x27, %x[n_channels]\n" + "udot z29.s, z8.b, z3.b[0]\n" + "whilelt p1.b, x28, x9\n" + "udot z27.s, z8.b, z3.b[2]\n" + "udot z26.s, z8.b, z5.b[0]\n" + "udot z25.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z4.b[0]\n" + "udot z23.s, z8.b, z4.b[2]\n" + "ld1b { z8.b }, p2/Z, [%x[params]]\n" + "udot z30.s, z9.b, z0.b[1]\n" + "udot z28.s, z9.b, z0.b[3]\n" + "udot z29.s, z9.b, z3.b[1]\n" + "udot z27.s, z9.b, z3.b[3]\n" + "udot z26.s, z9.b, z5.b[1]\n" + "udot z25.s, z9.b, z5.b[3]\n" + "udot z24.s, z9.b, z4.b[1]\n" + "udot z23.s, z9.b, z4.b[3]\n" + "ld1b { z9.b }, p2/Z, [%x[params], #1, MUL VL]\n" + "udot z30.s, z10.b, z3.b[0]\n" + 
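// Dot-product accumulation: each udot folds four u8 taps per 32-bit lane from the quad-broadcast inputs, with weight vectors z8-z11 streamed from [params] as the window walks the 5x5 kernel. +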
"udot z28.s, z10.b, z3.b[2]\n" + "udot z29.s, z10.b, z5.b[0]\n" + "udot z27.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z4.b[0]\n" + "udot z25.s, z10.b, z4.b[2]\n" + "udot z24.s, z10.b, z6.b[0]\n" + "udot z23.s, z10.b, z6.b[2]\n" + "ld1b { z10.b }, p2/Z, [%x[params], #2, MUL VL]\n" + "udot z30.s, z11.b, z3.b[1]\n" + "udot z28.s, z11.b, z3.b[3]\n" + "udot z29.s, z11.b, z5.b[1]\n" + "udot z27.s, z11.b, z5.b[3]\n" + "udot z26.s, z11.b, z4.b[1]\n" + "udot z25.s, z11.b, z4.b[3]\n" + "udot z24.s, z11.b, z6.b[1]\n" + "udot z23.s, z11.b, z6.b[3]\n" + "ld1b { z11.b }, p2/Z, [%x[params], #3, MUL VL]\n" + "udot z30.s, z8.b, z5.b[0]\n" + "udot z28.s, z8.b, z5.b[2]\n" + "udot z29.s, z8.b, z4.b[0]\n" + "udot z27.s, z8.b, z4.b[2]\n" + "udot z26.s, z8.b, z6.b[0]\n" + "udot z25.s, z8.b, z6.b[2]\n" + "udot z24.s, z8.b, z7.b[0]\n" + "udot z23.s, z8.b, z7.b[2]\n" + "ld1b { z8.b }, p2/Z, [%x[params], #4, MUL VL]\n" + "udot z30.s, z9.b, z5.b[1]\n" + "udot z28.s, z9.b, z5.b[3]\n" + "udot z29.s, z9.b, z4.b[1]\n" + "udot z27.s, z9.b, z4.b[3]\n" + "udot z26.s, z9.b, z6.b[1]\n" + "udot z25.s, z9.b, z6.b[3]\n" + "udot z24.s, z9.b, z7.b[1]\n" + "udot z23.s, z9.b, z7.b[3]\n" + "ld1b { z9.b }, p2/Z, [%x[params], #5, MUL VL]\n" + "addvl %x[params], %x[params], #16\n" + "udot z30.s, z10.b, z4.b[0]\n" + "ld1w { z13.s }, p1/Z, [%x[params], #-8, MUL VL]\n" + "udot z28.s, z10.b, z4.b[2]\n" + "udot z29.s, z10.b, z6.b[0]\n" + "udot z27.s, z10.b, z6.b[2]\n" + "udot z26.s, z10.b, z7.b[0]\n" + "udot z25.s, z10.b, z7.b[2]\n" + "udot z24.s, z10.b, z1.b[0]\n" + "udot z23.s, z10.b, z1.b[2]\n" + "ld1b { z10.b }, p1/Z, [%x[params], #-5, MUL VL]\n" + "udot z30.s, z11.b, z4.b[1]\n" + "udot z28.s, z11.b, z4.b[3]\n" + "udot z29.s, z11.b, z6.b[1]\n" + "udot z27.s, z11.b, z6.b[3]\n" + "udot z26.s, z11.b, z7.b[1]\n" + "udot z25.s, z11.b, z7.b[3]\n" + "udot z24.s, z11.b, z1.b[1]\n" + "udot z23.s, z11.b, z1.b[3]\n" + "ld1b { z11.b }, p1/Z, [%x[params], #-4, MUL VL]\n" + "udot z30.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z6.b[2]\n" + "udot z29.s, z8.b, z7.b[0]\n" + "udot z27.s, z8.b, z7.b[2]\n" + "udot z26.s, z8.b, z1.b[0]\n" + "udot z25.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z23.s, z8.b, z2.b[2]\n" + "ld1b { z8.b }, p1/Z, [%x[params], #-7, MUL VL]\n" + "udot z30.s, z9.b, z6.b[1]\n" + "udot z28.s, z9.b, z6.b[3]\n" + "udot z29.s, z9.b, z7.b[1]\n" + "udot z27.s, z9.b, z7.b[3]\n" + "udot z26.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[1]\n" + "udot z23.s, z9.b, z2.b[3]\n" + "ld1b { z9.b }, p1/Z, [%x[params], #-6, MUL VL]\n" + "addvl %x[params], %x[params], #-3\n" + ".inst 0x04b677de // sqrdmulh z30.s, z30.s, z22.s\n" + ".inst 0x04b6779c // sqrdmulh z28.s, z28.s, z22.s\n" + ".inst 0x04b677bd // sqrdmulh z29.s, z29.s, z22.s\n" + ".inst 0x04b6777b // sqrdmulh z27.s, z27.s, z22.s\n" + ".inst 0x04b6775a // sqrdmulh z26.s, z26.s, z22.s\n" + "and z20.d, z30.d, z21.d\n" + "asr z20.s, z20.s, #0x1f\n" + "and z19.d, z28.d, z21.d\n" + "and z18.d, z29.d, z21.d\n" + "asr z19.s, z19.s, #0x1f\n" + "and z17.d, z27.d, z21.d\n" + "and z16.d, z26.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + ".inst 0x04b67739 // sqrdmulh z25.s, z25.s, z22.s\n" + "asr z17.s, z17.s, #0x1f\n" + "sqadd z30.s, z30.s, z20.s\n" + ".inst 0x04b67718 // sqrdmulh z24.s, z24.s, z22.s\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b676f7 // sqrdmulh z23.s, z23.s, z22.s\n" + "sqadd z28.s, z28.s, z19.s\n" + "sqadd z29.s, z29.s, z18.s\n" + "and z18.d, z25.d, z21.d\n" + "asr z18.s, z18.s, #0x1f\n" + "sqadd z27.s, z27.s, z17.s\n" + "sqadd z26.s, z26.s, 
z16.s\n" + "and z17.d, z24.d, z21.d\n" + "asr z17.s, z17.s, #0x1f\n" + "and z16.d, z23.d, z21.d\n" + ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z25.s, z25.s, z18.s\n" + ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n" + "add z30.s, z30.s, z14.s\n" + "sqadd z24.s, z24.s, z17.s\n" + ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n" + "add z28.s, z28.s, z14.s\n" + "sqadd z23.s, z23.s, z16.s\n" + "smin z30.s, p2/M, z30.s, z12.s\n" + "add z29.s, z29.s, z14.s\n" + "smin z28.s, p2/M, z28.s, z12.s\n" + ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n" + "smax z30.s, p2/M, z30.s, z31.s\n" + "st1b { z30.s }, p0, [x26, x27]\n" + "add z27.s, z27.s, z14.s\n" + "smax z28.s, p2/M, z28.s, z31.s\n" + "ld1w { z30.s }, p2/Z, [SP]\n" + "smin z29.s, p2/M, z29.s, z12.s\n" + "st1b { z28.s }, p0, [x25, x27]\n" + "add z30.s, z30.s, z13.s\n" + "smin z27.s, p2/M, z27.s, z12.s\n" + "ld1w { z28.s }, p2/Z, [SP, #1, MUL VL]\n" + "smax z29.s, p2/M, z29.s, z31.s\n" + "st1b { z29.s }, p0, [x24, x27]\n" + "add z28.s, z28.s, z13.s\n" + "smax z27.s, p2/M, z27.s, z31.s\n" + "ld1w { z29.s }, p2/Z, [SP, #2, MUL VL]\n" + ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n" + "st1b { z27.s }, p0, [x23, x27]\n" + "add z29.s, z29.s, z13.s\n" + ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n" + "ld1w { z27.s }, p2/Z, [SP, #3, MUL VL]\n" + "add z26.s, z26.s, z14.s\n" + ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n" + ".inst 0x44828ab7 // srshl z23.s, p2/M, z23.s, z21.s\n" + "add z25.s, z25.s, z14.s\n" + "add z27.s, z27.s, z13.s\n" + "add z24.s, z24.s, z14.s\n" + "add z23.s, z23.s, z14.s\n" + "smin z26.s, p2/M, z26.s, z12.s\n" + "smin z25.s, p2/M, z25.s, z12.s\n" + "smin z24.s, p2/M, z24.s, z12.s\n" + "smin z23.s, p2/M, z23.s, z12.s\n" + "smax z26.s, p2/M, z26.s, z31.s\n" + "st1b { z26.s }, p0, [x22, x27]\n" + "smax z25.s, p2/M, z25.s, z31.s\n" + "smax z24.s, p2/M, z24.s, z31.s\n" + "ld1w { z26.s }, p2/Z, [SP, #4, MUL VL]\n" + "smax z23.s, p2/M, z23.s, z31.s\n" + "st1b { z25.s }, p0, [x21, x27]\n" + "add z26.s, z26.s, z13.s\n" + "st1b { z24.s }, p0, [x20, x27]\n" + "st1b { z23.s }, p0, [x19, x27]\n" + "incw x27\n" + "ld1w { z25.s }, p2/Z, [SP, #5, MUL VL]\n" + "add z25.s, z25.s, z13.s\n" + "ld1w { z24.s }, p2/Z, [SP, #6, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [SP, #7, MUL VL]\n" + "add z24.s, z24.s, z13.s\n" + "add z23.s, z23.s, z13.s\n" + "b.any 1b\n" + "addvl SP, SP, #8\n" + : [params] "+&r" (params) + : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__ARM_FEATURE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp 
b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..361f48bfbe --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef int8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 4; + constexpr static unsigned int input_cols = 4; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + + kern_type kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + + sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..4fc8999ea1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[16]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[5]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[3]; + inptrs[3] = inptrs_raw[6]; + inptrs[4] = inptrs_raw[9]; + inptrs[5] = inptrs_raw[12]; + inptrs[6] = inptrs_raw[15]; + inptrs[7] = inptrs_raw[1]; + inptrs[8] = inptrs_raw[2]; + inptrs[9] = inptrs_raw[10]; + inptrs[10] = inptrs_raw[4]; + inptrs[11] = inptrs_raw[7]; + inptrs[12] = inptrs_raw[8]; + inptrs[13] = inptrs_raw[11]; + inptrs[14] = inptrs_raw[13]; + inptrs[15] = inptrs_raw[14]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "ptrue p4.b\n" + "ldr x17, [%x[params],
%[offsetof_Params_weights]]\n" + "mov x16, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "mov x15, #0x0\n" + "ldr x14, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x13, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1rb { z11.b }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1rb { z26.b }, p4/Z, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1rw { z12.s }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1rw { z14.s }, p4/Z, [x20]\n" + "whilelt p3.h, x16, x8\n" + "ld1rw { z17.s }, p4/Z, [x19]\n" + "whilelt p2.s, x16, x8\n" + "ldp x11, x10, [x21, #0x0]\n" + "mov x19, x16\n" + "incw x19\n" + "ldp x9, x28, [x21, #0x10]\n" + "whilelt p1.s, x19, x8\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z10.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z10.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z15.s, z10.s, z16.s\n" + "mov z25.d, z13.d\n" + "ld1sb { z0.h }, p4/Z, [x17]\n" + "mov z23.d, z13.d\n" + "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n" + "mov z9.d, z15.d\n" + "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n" + "mov z22.d, z15.d\n" + "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n" + "mov z10.d, z13.d\n" + "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n" + "mov z24.d, z15.d\n" + "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n" + ".inst 0x455a1000 // ssublb z0.h, z0.b, z26.b\n" + "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n" + ".inst 0x455a1021 // ssublb z1.h, z1.b, z26.b\n" + "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n" + "inch x17, ALL, MUL #8\n" + ".inst 0x455a1042 // ssublb z2.h, z2.b, z26.b\n" + "ld1sb { z8.h }, p4/Z, [x17]\n" + ".inst 0x455a1063 // ssublb z3.h, z3.b, z26.b\n" + "ldp x23, x22, [x13, #0x0]\n" + ".inst 0x455a1084 // ssublb z4.h, z4.b, z26.b\n" + "ldp x21, x20, [x13, #0x10]\n" + ".inst 0x455a10a5 // ssublb z5.h, z5.b, z26.b\n" + ".inst 0x455a10c6 // ssublb z6.h, z6.b, z26.b\n" + "ldr x19, [x13, #0x20]\n" + ".inst 0x455a10e7 // ssublb z7.h, z7.b, z26.b\n" + ".inst 0x455a1108 // ssublb z8.h, z8.b, z26.b\n" + "ld1b { z31.h }, p3/Z, [x23, x16]\n" + "ld1b { z30.h }, p3/Z, [x22, x16]\n" + ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n" + "ld1b { z29.h }, p3/Z, [x21, x16]\n" + ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n" + "ld1b { z28.h }, p3/Z, [x20, x16]\n" + "ld1b { z27.h }, p3/Z, [x19, x16]\n" + ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n" + ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n" + ".inst 0x454b1b7b // usublb z27.h, z27.b, z11.b\n" + "1:" // Loop + ".inst 0x448443ed // smlalb z13.s, p4/M, z31.h, z4.h\n" + "ldr x20, [x13, #0x28]\n" + "whilelt p0.h, x15, x8\n" + ".inst 0x448447ef // smlalt z15.s, p4/M, z31.h, z4.h\n" + "ldr x27, [x13, #0x30]\n" + "inch x17\n" + ".inst 0x448343f9 // smlalb z25.s, p4/M, z31.h, z3.h\n" + "ldr x26, [x13, #0x38]\n" + ".inst 0x448347e9 // smlalt z9.s, p4/M, z31.h, z3.h\n" + "ldr x25, [x13, #0x40]\n" + ".inst 0x448143f7 // smlalb z23.s, p4/M, z31.h, z1.h\n" + "ldr x19, [x13, #0x48]\n" + ".inst 0x448147f6 // smlalt z22.s, p4/M, z31.h, z1.h\n" + "ldr x24, [x13, #0x50]\n" + ".inst 0x448043ea // smlalb z10.s, p4/M, z31.h, z0.h\n" + "ldr x23, [x13, #0x58]\n" + ".inst 0x448047f8 // smlalt z24.s, p4/M, z31.h, z0.h\n" + "ld1b { 
z31.h }, p3/Z, [x20, x16]\n" + ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n" + "ldr x22, [x13, #0x60]\n" + ".inst 0x448047cf // smlalt z15.s, p4/M, z30.h, z0.h\n" + "ld1b { z30.h }, p3/Z, [x19, x16]\n" + ".inst 0x448243b9 // smlalb z25.s, p4/M, z29.h, z2.h\n" + "ldr x21, [x13, #0x68]\n" + ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n" + "ldr x20, [x13, #0x70]\n" + ".inst 0x448247a9 // smlalt z9.s, p4/M, z29.h, z2.h\n" + "ld1b { z29.h }, p3/Z, [x27, x16]\n" + ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n" + "ldr x19, [x13, #0x78]\n" + ".inst 0x4485438d // smlalb z13.s, p4/M, z28.h, z5.h\n" + "ld1w { z19.s }, p2/Z, [x14]\n" + ".inst 0x4485478f // smlalt z15.s, p4/M, z28.h, z5.h\n" + "ld1w { z16.s }, p1/Z, [x14, #1, MUL VL]\n" + "addvl x14, x14, #2\n" + ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n" + ".inst 0x44844399 // smlalb z25.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844789 // smlalt z9.s, p4/M, z28.h, z4.h\n" + "uzp1 z21.s, z19.s, z16.s\n" + "uzp2 z18.s, z19.s, z16.s\n" + "ld1w { z19.s }, p2/Z, [x12]\n" + ".inst 0x44824397 // smlalb z23.s, p4/M, z28.h, z2.h\n" + "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n" + "addvl x12, x12, #2\n" + ".inst 0x44824796 // smlalt z22.s, p4/M, z28.h, z2.h\n" + ".inst 0x4481438a // smlalb z10.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814798 // smlalt z24.s, p4/M, z28.h, z1.h\n" + "ld1b { z28.h }, p3/Z, [x26, x16]\n" + "uzp1 z20.s, z19.s, z16.s\n" + "uzp2 z19.s, z19.s, z16.s\n" + ".inst 0x448643f7 // smlalb z23.s, p4/M, z31.h, z6.h\n" + ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n" + ".inst 0x448647f6 // smlalt z22.s, p4/M, z31.h, z6.h\n" + "ld1b { z31.h }, p3/Z, [x25, x16]\n" + ".inst 0x4487436d // smlalb z13.s, p4/M, z27.h, z7.h\n" + ".inst 0x4487476f // smlalt z15.s, p4/M, z27.h, z7.h\n" + ".inst 0x44864379 // smlalb z25.s, p4/M, z27.h, z6.h\n" + ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n" + ".inst 0x44864769 // smlalt z9.s, p4/M, z27.h, z6.h\n" + ".inst 0x44844377 // smlalb z23.s, p4/M, z27.h, z4.h\n" + ".inst 0x44844776 // smlalt z22.s, p4/M, z27.h, z4.h\n" + ".inst 0x4483436a // smlalb z10.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n" + ".inst 0x4481438d // smlalb z13.s, p4/M, z28.h, z1.h\n" + ".inst 0x4481478f // smlalt z15.s, p4/M, z28.h, z1.h\n" + ".inst 0x448843aa // smlalb z10.s, p4/M, z29.h, z8.h\n" + ".inst 0x448847b8 // smlalt z24.s, p4/M, z29.h, z8.h\n" + "ld1b { z29.h }, p3/Z, [x24, x16]\n" + ".inst 0x44804399 // smlalb z25.s, p4/M, z28.h, z0.h\n" + ".inst 0x44804789 // smlalt z9.s, p4/M, z28.h, z0.h\n" + "ld1b { z28.h }, p3/Z, [x23, x16]\n" + ".inst 0x448243ed // smlalb z13.s, p4/M, z31.h, z2.h\n" + ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n" + ".inst 0x448247ef // smlalt z15.s, p4/M, z31.h, z2.h\n" + ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n" + ".inst 0x448143f9 // smlalb z25.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147e9 // smlalt z9.s, p4/M, z31.h, z1.h\n" + "ld1b { z31.h }, p3/Z, [x22, x16]\n" + ".inst 0x448843cd // smlalb z13.s, p4/M, z30.h, z8.h\n" + ".inst 0x448847cf // smlalt z15.s, p4/M, z30.h, z8.h\n" + ".inst 0x448743d9 // smlalb z25.s, p4/M, z30.h, z7.h\n" + ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n" + ".inst 0x448747c9 // smlalt z9.s, p4/M, z30.h, z7.h\n" + ".inst 0x448543d7 // smlalb z23.s, p4/M, z30.h, z5.h\n" + ".inst 0x448547d6 // smlalt z22.s, p4/M, z30.h, z5.h\n" + ".inst 0x448443ca // smlalb z10.s, p4/M, z30.h, z4.h\n" + ".inst 0x448447d8 // smlalt z24.s, p4/M, z30.h, z4.h\n" + "ld1b { z30.h }, p3/Z, [x21, x16]\n" + ".inst 0x448343ad // smlalb 
z13.s, p4/M, z29.h, z3.h\n" + ".inst 0x448347af // smlalt z15.s, p4/M, z29.h, z3.h\n" + ".inst 0x448043b7 // smlalb z23.s, p4/M, z29.h, z0.h\n" + ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n" + ".inst 0x448047b6 // smlalt z22.s, p4/M, z29.h, z0.h\n" + "ld1b { z29.h }, p3/Z, [x20, x16]\n" + ".inst 0x44854399 // smlalb z25.s, p4/M, z28.h, z5.h\n" + ".inst 0x44854789 // smlalt z9.s, p4/M, z28.h, z5.h\n" + ".inst 0x4482438a // smlalb z10.s, p4/M, z28.h, z2.h\n" + ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n" + ".inst 0x44824798 // smlalt z24.s, p4/M, z28.h, z2.h\n" + "ld1b { z28.h }, p3/Z, [x19, x16]\n" + "inch x16\n" + ".inst 0x448643ed // smlalb z13.s, p4/M, z31.h, z6.h\n" + "whilelt p2.s, x16, x8\n" + ".inst 0x448647ef // smlalt z15.s, p4/M, z31.h, z6.h\n" + "mov x19, x16\n" + ".inst 0x448343f7 // smlalb z23.s, p4/M, z31.h, z3.h\n" + "incw x19\n" + ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n" + "whilelt p1.s, x19, x8\n" + ".inst 0x448347f6 // smlalt z22.s, p4/M, z31.h, z3.h\n" + "whilelt p3.h, x16, x8\n" + ".inst 0x04b575ad // sqrdmulh z13.s, z13.s, z21.s\n" + ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n" + ".inst 0x448843d9 // smlalb z25.s, p4/M, z30.h, z8.h\n" + ".inst 0x448847c9 // smlalt z9.s, p4/M, z30.h, z8.h\n" + "and z4.d, z13.d, z20.d\n" + "and z16.d, z15.d, z19.d\n" + ".inst 0x04b57739 // sqrdmulh z25.s, z25.s, z21.s\n" + "asr z4.s, z4.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n" + "sqadd z13.s, z13.s, z4.s\n" + "sqadd z15.s, z15.s, z16.s\n" + "and z2.d, z25.d, z20.d\n" + "and z16.d, z9.d, z19.d\n" + ".inst 0x448543ca // smlalb z10.s, p4/M, z30.h, z5.h\n" + "asr z2.s, z2.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x448547d8 // smlalt z24.s, p4/M, z30.h, z5.h\n" + "sqadd z25.s, z25.s, z2.s\n" + "sqadd z9.s, z9.s, z16.s\n" + ".inst 0x448743b7 // smlalb z23.s, p4/M, z29.h, z7.h\n" + ".inst 0x448747b6 // smlalt z22.s, p4/M, z29.h, z7.h\n" + ".inst 0x448643aa // smlalb z10.s, p4/M, z29.h, z6.h\n" + ".inst 0x448647b8 // smlalt z24.s, p4/M, z29.h, z6.h\n" + ".inst 0x44884397 // smlalb z23.s, p4/M, z28.h, z8.h\n" + ".inst 0x44884796 // smlalt z22.s, p4/M, z28.h, z8.h\n" + ".inst 0x4487438a // smlalb z10.s, p4/M, z28.h, z7.h\n" + ".inst 0x44874798 // smlalt z24.s, p4/M, z28.h, z7.h\n" + ".inst 0x04b576f7 // sqrdmulh z23.s, z23.s, z21.s\n" + ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n" + ".inst 0x04b5754a // sqrdmulh z10.s, z10.s, z21.s\n" + ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n" + "and z18.d, z23.d, z20.d\n" + "and z0.d, z22.d, z19.d\n" + "and z16.d, z10.d, z20.d\n" + "asr z18.s, z18.s, #0x1f\n" + "asr z0.s, z0.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z23.s, z23.s, z18.s\n" + "sqadd z22.s, z22.s, z0.s\n" + "sqadd z10.s, z10.s, z16.s\n" + "and z16.d, z24.d, z19.d\n" + ".inst 0x4482928d // srshl z13.s, p4/M, z13.s, z20.s\n" + ".inst 0x4482926f // srshl z15.s, p4/M, z15.s, z19.s\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x44829299 // srshl z25.s, p4/M, z25.s, z20.s\n" + "add z13.s, z13.s, z12.s\n" + "add z15.s, z15.s, z12.s\n" + "sqadd z24.s, z24.s, z16.s\n" + "add z25.s, z25.s, z12.s\n" + "smin z13.s, p4/M, z13.s, z17.s\n" + "smin z15.s, p4/M, z15.s, z17.s\n" + "smin z25.s, p4/M, z25.s, z17.s\n" + ".inst 0x44829269 // srshl z9.s, p4/M, z9.s, z19.s\n" + "smax z13.s, p4/M, z13.s, z14.s\n" + "smax z15.s, p4/M, z15.s, z14.s\n" + "smax z25.s, p4/M, z25.s, z14.s\n" + "add z9.s, z9.s, z12.s\n" + ".inst 0x44829297 // srshl z23.s, p4/M, z23.s, z20.s\n" + "trn1 z13.h, z13.h, z15.h\n" + 
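// Descriptive note on the epilogue above: each pair of 32-bit accumulators is requantized with SQRDMULH against the per-channel multiplier, given a rounding fix-up (AND/ASR/SQADD), shifted with SRSHL, offset by the c_offset (z12) and clamped to [minval, maxval] (z14/z17); TRN1 then re-interleaves the even/odd channel halves so the ST1B stores below can write the uint8 NHWC output. +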
"st1b { z13.h }, p0, [x11, x15]\n" + "smin z9.s, p4/M, z9.s, z17.s\n" + ".inst 0x44829276 // srshl z22.s, p4/M, z22.s, z19.s\n" + "add z23.s, z23.s, z12.s\n" + ".inst 0x4482928a // srshl z10.s, p4/M, z10.s, z20.s\n" + ".inst 0x44829278 // srshl z24.s, p4/M, z24.s, z19.s\n" + "add z22.s, z22.s, z12.s\n" + "smax z9.s, p4/M, z9.s, z14.s\n" + "add z10.s, z10.s, z12.s\n" + "add z24.s, z24.s, z12.s\n" + "smin z23.s, p4/M, z23.s, z17.s\n" + "trn1 z25.h, z25.h, z9.h\n" + "st1b { z25.h }, p0, [x10, x15]\n" + "smin z22.s, p4/M, z22.s, z17.s\n" + "smin z10.s, p4/M, z10.s, z17.s\n" + "smax z23.s, p4/M, z23.s, z14.s\n" + "smin z24.s, p4/M, z24.s, z17.s\n" + "smax z22.s, p4/M, z22.s, z14.s\n" + "smax z10.s, p4/M, z10.s, z14.s\n" + "smax z24.s, p4/M, z24.s, z14.s\n" + "trn1 z23.h, z23.h, z22.h\n" + "st1b { z23.h }, p0, [x9, x15]\n" + "trn1 z10.h, z10.h, z24.h\n" + "st1b { z10.h }, p0, [x28, x15]\n" + "inch x15\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z10.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z10.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z15.s, z10.s, z16.s\n" + "mov z25.d, z13.d\n" + "ld1sb { z0.h }, p4/Z, [x17]\n" + "mov z23.d, z13.d\n" + "ld1sb { z1.h }, p4/Z, [x17, #1, MUL VL]\n" + "mov z9.d, z15.d\n" + "ld1sb { z2.h }, p4/Z, [x17, #2, MUL VL]\n" + "mov z22.d, z15.d\n" + "ld1sb { z3.h }, p4/Z, [x17, #3, MUL VL]\n" + "mov z10.d, z13.d\n" + "ld1sb { z4.h }, p4/Z, [x17, #4, MUL VL]\n" + "mov z24.d, z15.d\n" + "ld1sb { z5.h }, p4/Z, [x17, #5, MUL VL]\n" + ".inst 0x455a1000 // ssublb z0.h, z0.b, z26.b\n" + "ld1sb { z6.h }, p4/Z, [x17, #6, MUL VL]\n" + ".inst 0x455a1021 // ssublb z1.h, z1.b, z26.b\n" + "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n" + "inch x17, ALL, MUL #8\n" + ".inst 0x455a1042 // ssublb z2.h, z2.b, z26.b\n" + "ld1sb { z8.h }, p4/Z, [x17]\n" + ".inst 0x455a1063 // ssublb z3.h, z3.b, z26.b\n" + "ldp x23, x22, [x13, #0x0]\n" + ".inst 0x455a1084 // ssublb z4.h, z4.b, z26.b\n" + "ldp x21, x20, [x13, #0x10]\n" + ".inst 0x455a10a5 // ssublb z5.h, z5.b, z26.b\n" + ".inst 0x455a10c6 // ssublb z6.h, z6.b, z26.b\n" + "ldr x19, [x13, #0x20]\n" + ".inst 0x455a10e7 // ssublb z7.h, z7.b, z26.b\n" + ".inst 0x455a1108 // ssublb z8.h, z8.b, z26.b\n" + "ld1b { z31.h }, p3/Z, [x23, x16]\n" + "ld1b { z30.h }, p3/Z, [x22, x16]\n" + ".inst 0x454b1bff // usublb z31.h, z31.b, z11.b\n" + "ld1b { z29.h }, p3/Z, [x21, x16]\n" + ".inst 0x454b1bde // usublb z30.h, z30.b, z11.b\n" + "ld1b { z28.h }, p3/Z, [x20, x16]\n" + "ld1b { z27.h }, p3/Z, [x19, x16]\n" + ".inst 0x454b1bbd // usublb z29.h, z29.b, z11.b\n" + ".inst 0x454b1b9c // usublb z28.h, z28.b, z11.b\n" + ".inst 0x454b1b7b // usublb z27.h, z27.b, z11.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" 
(offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..dc33a3fe3f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef int8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 5; + constexpr static unsigned int input_cols = 5; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + + kern_type kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + + sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..63960f08e1 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[25]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[12]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[1]; + inptrs[3] = inptrs_raw[3]; + inptrs[4] = inptrs_raw[4]; + inptrs[5] = inptrs_raw[5]; + inptrs[6] = inptrs_raw[6]; + inptrs[7] = inptrs_raw[2]; + inptrs[8] = inptrs_raw[8]; + inptrs[9] = inptrs_raw[9]; + inptrs[10] = inptrs_raw[7]; + inptrs[11] = inptrs_raw[15]; + inptrs[12] = inptrs_raw[10]; + inptrs[13] = inptrs_raw[16]; + inptrs[14] = inptrs_raw[11]; + inptrs[15] = inptrs_raw[18]; + inptrs[16] = inptrs_raw[13]; + inptrs[17] = inptrs_raw[19]; + inptrs[18] = inptrs_raw[20]; + inptrs[19] = inptrs_raw[14]; + inptrs[20] = inptrs_raw[21]; + inptrs[21] = inptrs_raw[17]; + inptrs[22] = inptrs_raw[23]; + inptrs[23] = inptrs_raw[22]; + inptrs[24] = inptrs_raw[24]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" + "ptrue p4.b\n" + "ldr x5, [%x[params], %[offsetof_Params_weights]]\n" + "mov x6, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "mov x7, #0x0\n" + "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x17, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1rb { z16.b }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1rb { z12.b }, p4/Z, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1rw { z14.s }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_maxval]\n" + "ld1rw { z17.s }, p4/Z, [x20]\n" + "whilelt p3.h, x6, x4\n" + "ld1rw { z15.s }, p4/Z, [x19]\n" + "whilelt p2.s, x6, x4\n" + "ldp x15, x14, [x21, #0x0]\n" + "mov x19, x6\n" + "incw x19\n" + "ldp x13, x12, [x21, #0x10]\n" + "whilelt p1.s, x19, x4\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z20.s 
}, p2/Z, [x19]\n" + "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z20.s, z10.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z20.s, z20.s, z10.s\n" + "mov z11.d, z13.d\n" + "ld1sb { z0.h }, p4/Z, [x5]\n" + "mov z9.d, z13.d\n" + "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n" + "mov z18.d, z20.d\n" + "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n" + "mov z19.d, z20.d\n" + "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n" + "mov z23.d, z13.d\n" + "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n" + "mov z21.d, z20.d\n" + "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n" + ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n" + "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n" + ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n" + "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n" + "inch x5, ALL, MUL #8\n" + ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n" + "ld1sb { z8.h }, p4/Z, [x5]\n" + ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n" + "ldp x26, x25, [x17, #0x0]\n" + ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n" + "ldp x24, x23, [x17, #0x10]\n" + ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n" + ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n" + "ldp x22, x21, [x17, #0x20]\n" + ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n" + ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n" + "ldp x20, x19, [x17, #0x30]\n" + "ld1b { z31.h }, p3/Z, [x26, x6]\n" + ".inst 0x45501bff // usublb z31.h, z31.b, z16.b\n" + "ld1b { z30.h }, p3/Z, [x25, x6]\n" + "ld1b { z29.h }, p3/Z, [x24, x6]\n" + ".inst 0x45501bde // usublb z30.h, z30.b, z16.b\n" + "ld1b { z28.h }, p3/Z, [x23, x6]\n" + ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n" + "ld1b { z27.h }, p3/Z, [x22, x6]\n" + "ld1b { z26.h }, p3/Z, [x21, x6]\n" + ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n" + "ld1b { z25.h }, p3/Z, [x20, x6]\n" + "ld1b { z24.h }, p3/Z, [x19, x6]\n" + ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n" + ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n" + ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n" + ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n" + "1:" // Loop + ".inst 0x448843ed // smlalb z13.s, p4/M, z31.h, z8.h\n" + "ldr x22, [x17, #0x40]\n" + "whilelt p0.h, x7, x4\n" + ".inst 0x448847f4 // smlalt z20.s, p4/M, z31.h, z8.h\n" + "ldr x21, [x17, #0x48]\n" + "inch x5\n" + ".inst 0x448643eb // smlalb z11.s, p4/M, z31.h, z6.h\n" + "ldr x20, [x17, #0x50]\n" + ".inst 0x448647f2 // smlalt z18.s, p4/M, z31.h, z6.h\n" + "ldr x19, [x17, #0x58]\n" + ".inst 0x448243e9 // smlalb z9.s, p4/M, z31.h, z2.h\n" + "ldr x11, [x17, #0x60]\n" + ".inst 0x448247f3 // smlalt z19.s, p4/M, z31.h, z2.h\n" + "ldr x10, [x17, #0x68]\n" + ".inst 0x448043f7 // smlalb z23.s, p4/M, z31.h, z0.h\n" + "ldr x9, [x17, #0x70]\n" + ".inst 0x448047f5 // smlalt z21.s, p4/M, z31.h, z0.h\n" + "ldr x28, [x17, #0x78]\n" + ".inst 0x448043cd // smlalb z13.s, p4/M, z30.h, z0.h\n" + "ldr x27, [x17, #0x80]\n" + ".inst 0x448047d4 // smlalt z20.s, p4/M, z30.h, z0.h\n" + "ldr x26, [x17, #0x88]\n" + ".inst 0x4481438b // smlalb z11.s, p4/M, z28.h, z1.h\n" + "ldr x25, [x17, #0x90]\n" + ".inst 0x44814792 // smlalt z18.s, p4/M, z28.h, z1.h\n" + "ld1b { z28.h }, p3/Z, [x21, x6]\n" + ".inst 0x448143ad // smlalb z13.s, p4/M, z29.h, z1.h\n" + "ldr x24, [x17, #0x98]\n" + ".inst 0x448147b4 // smlalt z20.s, p4/M, z29.h, z1.h\n" + "ld1b { z29.h }, p3/Z, [x22, x6]\n" + ".inst 0x4482436b // smlalb z11.s, p4/M, z27.h, z2.h\n" + "ldr x23, [x17, #0xa0]\n" + ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n" + "ldr x22, [x17, #0xa8]\n" + ".inst 0x44824772 // smlalt z18.s, p4/M, 
z27.h, z2.h\n" + "ld1b { z27.h }, p3/Z, [x20, x6]\n" + ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n" + "ldr x21, [x17, #0xb0]\n" + ".inst 0x4483434d // smlalb z13.s, p4/M, z26.h, z3.h\n" + "ldr x20, [x17, #0xb8]\n" + ".inst 0x44834754 // smlalt z20.s, p4/M, z26.h, z3.h\n" + "ld1b { z26.h }, p3/Z, [x19, x6]\n" + ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n" + "ldr x19, [x17, #0xc0]\n" + ".inst 0x4480430b // smlalb z11.s, p4/M, z24.h, z0.h\n" + "ld1w { z10.s }, p2/Z, [x8]\n" + ".inst 0x4484432d // smlalb z13.s, p4/M, z25.h, z4.h\n" + "ld1w { z22.s }, p1/Z, [x8, #1, MUL VL]\n" + "addvl x8, x8, #2\n" + ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n" + ".inst 0x44844734 // smlalt z20.s, p4/M, z25.h, z4.h\n" + "ld1b { z25.h }, p3/Z, [x11, x6]\n" + ".inst 0x44804712 // smlalt z18.s, p4/M, z24.h, z0.h\n" + "uzp1 z31.s, z10.s, z22.s\n" + "uzp2 z30.s, z10.s, z22.s\n" + "ld1w { z10.s }, p2/Z, [x16]\n" + ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n" + "ld1w { z22.s }, p1/Z, [x16, #1, MUL VL]\n" + "addvl x16, x16, #2\n" + ".inst 0x4482430d // smlalb z13.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824714 // smlalt z20.s, p4/M, z24.h, z2.h\n" + "ld1b { z24.h }, p3/Z, [x9, x6]\n" + ".inst 0x448443ab // smlalb z11.s, p4/M, z29.h, z4.h\n" + ".inst 0x448447b2 // smlalt z18.s, p4/M, z29.h, z4.h\n" + "ld1b { z29.h }, p3/Z, [x10, x6]\n" + ".inst 0x44834349 // smlalb z9.s, p4/M, z26.h, z3.h\n" + ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n" + ".inst 0x4485438b // smlalb z11.s, p4/M, z28.h, z5.h\n" + ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n" + ".inst 0x44854792 // smlalt z18.s, p4/M, z28.h, z5.h\n" + "ld1b { z28.h }, p3/Z, [x27, x6]\n" + ".inst 0x4485436d // smlalb z13.s, p4/M, z27.h, z5.h\n" + ".inst 0x44854774 // smlalt z20.s, p4/M, z27.h, z5.h\n" + ".inst 0x4483436b // smlalb z11.s, p4/M, z27.h, z3.h\n" + ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n" + ".inst 0x44834772 // smlalt z18.s, p4/M, z27.h, z3.h\n" + "ld1b { z27.h }, p3/Z, [x28, x6]\n" + ".inst 0x44834753 // smlalt z19.s, p4/M, z26.h, z3.h\n" + "ld1b { z26.h }, p3/Z, [x26, x6]\n" + ".inst 0x4486432d // smlalb z13.s, p4/M, z25.h, z6.h\n" + ".inst 0x44864734 // smlalt z20.s, p4/M, z25.h, z6.h\n" + ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n" + ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n" + ".inst 0x44804329 // smlalb z9.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804733 // smlalt z19.s, p4/M, z25.h, z0.h\n" + "ld1b { z25.h }, p3/Z, [x25, x6]\n" + "uzp1 z0.s, z10.s, z22.s\n" + "uzp2 z22.s, z10.s, z22.s\n" + ".inst 0x448443a9 // smlalb z9.s, p4/M, z29.h, z4.h\n" + ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n" + ".inst 0x448447b3 // smlalt z19.s, p4/M, z29.h, z4.h\n" + "ld1b { z29.h }, p3/Z, [x24, x6]\n" + ".inst 0x4487430d // smlalb z13.s, p4/M, z24.h, z7.h\n" + ".inst 0x44874714 // smlalt z20.s, p4/M, z24.h, z7.h\n" + ".inst 0x44814309 // smlalb z9.s, p4/M, z24.h, z1.h\n" + ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n" + ".inst 0x04bf75ad // sqrdmulh z13.s, z13.s, z31.s\n" + ".inst 0x04be7694 // sqrdmulh z20.s, z20.s, z30.s\n" + ".inst 0x44814713 // smlalt z19.s, p4/M, z24.h, z1.h\n" + "ld1b { z24.h }, p3/Z, [x22, x6]\n" + ".inst 0x44844377 // smlalb z23.s, p4/M, z27.h, z4.h\n" + "and z10.d, z13.d, z0.d\n" + ".inst 0x44844775 // smlalt z21.s, p4/M, z27.h, z4.h\n" + "ld1b { z27.h }, p3/Z, [x23, x6]\n" + ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n" + "asr z10.s, z10.s, #0x1f\n" + "and z4.d, z20.d, z22.d\n" + ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n" + "sqadd z13.s, z13.s, z10.s\n" + "asr z4.s, 
z4.s, #0x1f\n" + ".inst 0x4487438b // smlalb z11.s, p4/M, z28.h, z7.h\n" + ".inst 0x44874792 // smlalt z18.s, p4/M, z28.h, z7.h\n" + "sqadd z20.s, z20.s, z4.s\n" + ".inst 0x44814397 // smlalb z23.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814795 // smlalt z21.s, p4/M, z28.h, z1.h\n" + ".inst 0x44864329 // smlalb z9.s, p4/M, z25.h, z6.h\n" + ".inst 0x44864733 // smlalt z19.s, p4/M, z25.h, z6.h\n" + "ld1b { z25.h }, p3/Z, [x20, x6]\n" + ".inst 0x44854357 // smlalb z23.s, p4/M, z26.h, z5.h\n" + ".inst 0x44854755 // smlalt z21.s, p4/M, z26.h, z5.h\n" + "ld1b { z26.h }, p3/Z, [x21, x6]\n" + ".inst 0x448843ab // smlalb z11.s, p4/M, z29.h, z8.h\n" + ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n" + ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n" + ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n" + ".inst 0x04bf756b // sqrdmulh z11.s, z11.s, z31.s\n" + ".inst 0x448243b7 // smlalb z23.s, p4/M, z29.h, z2.h\n" + ".inst 0x04be7652 // sqrdmulh z18.s, z18.s, z30.s\n" + ".inst 0x448247b5 // smlalt z21.s, p4/M, z29.h, z2.h\n" + "ld1b { z29.h }, p3/Z, [x19, x6]\n" + "inch x6\n" + "and z2.d, z11.d, z0.d\n" + "whilelt p2.s, x6, x4\n" + ".inst 0x44874369 // smlalb z9.s, p4/M, z27.h, z7.h\n" + "mov x19, x6\n" + "and z10.d, z18.d, z22.d\n" + "incw x19\n" + ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n" + "whilelt p1.s, x19, x4\n" + "asr z2.s, z2.s, #0x1f\n" + "whilelt p3.h, x6, x4\n" + "asr z10.s, z10.s, #0x1f\n" + ".inst 0x44874773 // smlalt z19.s, p4/M, z27.h, z7.h\n" + "sqadd z11.s, z11.s, z2.s\n" + "sqadd z18.s, z18.s, z10.s\n" + ".inst 0x44854309 // smlalb z9.s, p4/M, z24.h, z5.h\n" + ".inst 0x44854713 // smlalt z19.s, p4/M, z24.h, z5.h\n" + ".inst 0x44834317 // smlalb z23.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834715 // smlalt z21.s, p4/M, z24.h, z3.h\n" + ".inst 0x44884329 // smlalb z9.s, p4/M, z25.h, z8.h\n" + ".inst 0x44884733 // smlalt z19.s, p4/M, z25.h, z8.h\n" + ".inst 0x44874357 // smlalb z23.s, p4/M, z26.h, z7.h\n" + ".inst 0x44874755 // smlalt z21.s, p4/M, z26.h, z7.h\n" + ".inst 0x04bf7529 // sqrdmulh z9.s, z9.s, z31.s\n" + ".inst 0x04be7673 // sqrdmulh z19.s, z19.s, z30.s\n" + ".inst 0x44864337 // smlalb z23.s, p4/M, z25.h, z6.h\n" + ".inst 0x44864735 // smlalt z21.s, p4/M, z25.h, z6.h\n" + "and z10.d, z9.d, z0.d\n" + "and z24.d, z19.d, z22.d\n" + ".inst 0x448843b7 // smlalb z23.s, p4/M, z29.h, z8.h\n" + "asr z10.s, z10.s, #0x1f\n" + "asr z24.s, z24.s, #0x1f\n" + ".inst 0x448847b5 // smlalt z21.s, p4/M, z29.h, z8.h\n" + "sqadd z9.s, z9.s, z10.s\n" + "sqadd z19.s, z19.s, z24.s\n" + ".inst 0x04bf76f7 // sqrdmulh z23.s, z23.s, z31.s\n" + ".inst 0x04be76b5 // sqrdmulh z21.s, z21.s, z30.s\n" + ".inst 0x4482900d // srshl z13.s, p4/M, z13.s, z0.s\n" + ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n" + "and z30.d, z23.d, z0.d\n" + "and z28.d, z21.d, z22.d\n" + "add z13.s, z13.s, z14.s\n" + "add z20.s, z20.s, z14.s\n" + "asr z30.s, z30.s, #0x1f\n" + "asr z28.s, z28.s, #0x1f\n" + "smin z13.s, p4/M, z13.s, z15.s\n" + "sqadd z23.s, z23.s, z30.s\n" + "sqadd z21.s, z21.s, z28.s\n" + "smin z20.s, p4/M, z20.s, z15.s\n" + "smax z13.s, p4/M, z13.s, z17.s\n" + ".inst 0x4482900b // srshl z11.s, p4/M, z11.s, z0.s\n" + ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n" + "smax z20.s, p4/M, z20.s, z17.s\n" + ".inst 0x44829009 // srshl z9.s, p4/M, z9.s, z0.s\n" + "add z11.s, z11.s, z14.s\n" + "add z18.s, z18.s, z14.s\n" + "trn1 z13.h, z13.h, z20.h\n" + "st1b { z13.h }, p0, [x15, x7]\n" + "add z9.s, z9.s, z14.s\n" + "smin z11.s, p4/M, z11.s, z15.s\n" + "smin z18.s, p4/M, z18.s, z15.s\n" + ".inst 
0x448292d3 // srshl z19.s, p4/M, z19.s, z22.s\n" + "smin z9.s, p4/M, z9.s, z15.s\n" + "smax z11.s, p4/M, z11.s, z17.s\n" + "smax z18.s, p4/M, z18.s, z17.s\n" + "add z19.s, z19.s, z14.s\n" + "smax z9.s, p4/M, z9.s, z17.s\n" + ".inst 0x44829017 // srshl z23.s, p4/M, z23.s, z0.s\n" + "trn1 z11.h, z11.h, z18.h\n" + "st1b { z11.h }, p0, [x14, x7]\n" + "smin z19.s, p4/M, z19.s, z15.s\n" + ".inst 0x448292d5 // srshl z21.s, p4/M, z21.s, z22.s\n" + "add z23.s, z23.s, z14.s\n" + "add z21.s, z21.s, z14.s\n" + "smax z19.s, p4/M, z19.s, z17.s\n" + "smin z23.s, p4/M, z23.s, z15.s\n" + "smin z21.s, p4/M, z21.s, z15.s\n" + "trn1 z9.h, z9.h, z19.h\n" + "st1b { z9.h }, p0, [x13, x7]\n" + "smax z23.s, p4/M, z23.s, z17.s\n" + "smax z21.s, p4/M, z21.s, z17.s\n" + "trn1 z23.h, z23.h, z21.h\n" + "st1b { z23.h }, p0, [x12, x7]\n" + "inch x7\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z20.s }, p2/Z, [x19]\n" + "ld1w { z10.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z13.s, z20.s, z10.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z20.s, z20.s, z10.s\n" + "mov z11.d, z13.d\n" + "ld1sb { z0.h }, p4/Z, [x5]\n" + "mov z9.d, z13.d\n" + "ld1sb { z1.h }, p4/Z, [x5, #1, MUL VL]\n" + "mov z18.d, z20.d\n" + "ld1sb { z2.h }, p4/Z, [x5, #2, MUL VL]\n" + "mov z19.d, z20.d\n" + "ld1sb { z3.h }, p4/Z, [x5, #3, MUL VL]\n" + "mov z23.d, z13.d\n" + "ld1sb { z4.h }, p4/Z, [x5, #4, MUL VL]\n" + "mov z21.d, z20.d\n" + "ld1sb { z5.h }, p4/Z, [x5, #5, MUL VL]\n" + ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n" + "ld1sb { z6.h }, p4/Z, [x5, #6, MUL VL]\n" + ".inst 0x454c1021 // ssublb z1.h, z1.b, z12.b\n" + "ld1sb { z7.h }, p4/Z, [x5, #7, MUL VL]\n" + "inch x5, ALL, MUL #8\n" + ".inst 0x454c1042 // ssublb z2.h, z2.b, z12.b\n" + "ld1sb { z8.h }, p4/Z, [x5]\n" + ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n" + "ldp x26, x25, [x17, #0x0]\n" + ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n" + "ldp x24, x23, [x17, #0x10]\n" + ".inst 0x454c10a5 // ssublb z5.h, z5.b, z12.b\n" + ".inst 0x454c10c6 // ssublb z6.h, z6.b, z12.b\n" + "ldp x22, x21, [x17, #0x20]\n" + ".inst 0x454c10e7 // ssublb z7.h, z7.b, z12.b\n" + ".inst 0x454c1108 // ssublb z8.h, z8.b, z12.b\n" + "ldp x20, x19, [x17, #0x30]\n" + "ld1b { z31.h }, p3/Z, [x26, x6]\n" + ".inst 0x45501bff // usublb z31.h, z31.b, z16.b\n" + "ld1b { z30.h }, p3/Z, [x25, x6]\n" + "ld1b { z29.h }, p3/Z, [x24, x6]\n" + ".inst 0x45501bde // usublb z30.h, z30.b, z16.b\n" + "ld1b { z28.h }, p3/Z, [x23, x6]\n" + ".inst 0x45501bbd // usublb z29.h, z29.b, z16.b\n" + "ld1b { z27.h }, p3/Z, [x22, x6]\n" + "ld1b { z26.h }, p3/Z, [x21, x6]\n" + ".inst 0x45501b9c // usublb z28.h, z28.b, z16.b\n" + "ld1b { z25.h }, p3/Z, [x20, x6]\n" + "ld1b { z24.h }, p3/Z, [x19, x6]\n" + ".inst 0x45501b7b // usublb z27.h, z27.b, z16.b\n" + ".inst 0x45501b5a // usublb z26.h, z26.b, z16.b\n" + ".inst 0x45501b39 // usublb z25.h, z25.b, z16.b\n" + ".inst 0x45501b18 // usublb z24.h, z24.b, z16.b\n" + "b.any 1b\n" + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" 
(offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..906ef36c8f --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +struct sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst +{ + typedef int32_t bias_type; + typedef uint8_t input_type; + typedef int8_t weight_type; + typedef uint8_t return_type; + + constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + + typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); + typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + + constexpr static unsigned int kernel_rows = 5; + constexpr static unsigned int kernel_cols = 5; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + constexpr static unsigned int output_rows = 2; + constexpr static unsigned int output_cols = 2; + + constexpr static unsigned int input_rows = 6; + constexpr static unsigned int input_cols = 6; + + constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters; + constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size; + + kern_type kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + + sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..6c321efa29 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) + +namespace arm_conv { +namespace depthwise { + +void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const int8_t *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[36]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const int8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[0]; + inptrs[1] = inptrs_raw[1]; + inptrs[2] = inptrs_raw[6]; + inptrs[3] = inptrs_raw[7]; + inptrs[4] = inptrs_raw[2]; + inptrs[5] = inptrs_raw[8]; + inptrs[6] = inptrs_raw[3]; + inptrs[7] = inptrs_raw[4]; + inptrs[8] = inptrs_raw[11]; + inptrs[9] = inptrs_raw[12]; + inptrs[10] = inptrs_raw[9]; + inptrs[11] = inptrs_raw[10]; + inptrs[12] = inptrs_raw[5]; + inptrs[13] = inptrs_raw[13]; + inptrs[14] = inptrs_raw[14]; + inptrs[15] = inptrs_raw[15]; + inptrs[16] = inptrs_raw[16]; + inptrs[17] = inptrs_raw[17]; + inptrs[18] = inptrs_raw[18]; + inptrs[19] = inptrs_raw[19]; + inptrs[20] = inptrs_raw[20]; + inptrs[21] = inptrs_raw[21]; + inptrs[22] = inptrs_raw[22]; + inptrs[23] = inptrs_raw[23]; + inptrs[24] = inptrs_raw[24]; + inptrs[25] = inptrs_raw[25]; + inptrs[26] = inptrs_raw[26]; + inptrs[27] = inptrs_raw[27]; + inptrs[28] = inptrs_raw[28]; + inptrs[29] = inptrs_raw[29]; + inptrs[30] = inptrs_raw[30]; + inptrs[31] = inptrs_raw[31]; + inptrs[32] = inptrs_raw[32]; + inptrs[33] = inptrs_raw[33]; + inptrs[34] = inptrs_raw[34]; + inptrs[35] = inptrs_raw[35]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n" + "ptrue p4.b\n" + "ldr x1, [%x[params], %[offsetof_Params_weights]]\n" + "mov x2, #0x0\n" + "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" + "mov x3, #0x0\n" + "ldr x4, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x5, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "add x19, x22, %[offsetof_Requantize32_a_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x22, %[offsetof_Requantize32_b_offset]\n" + "ld1rb { z9.b }, p4/Z, [x19]\n" + "add x19, x22, %[offsetof_Requantize32_c_offset]\n" + "ld1rb { z14.b }, p4/Z, [x20]\n" + "add x20, x22, %[offsetof_Requantize32_minval]\n" + "ld1rw { z17.s }, p4/Z, [x19]\n" + "add x19, x22, 
%[offsetof_Requantize32_maxval]\n" + "ld1rw { z12.s }, p4/Z, [x20]\n" + "whilelt p3.h, x2, x0\n" + "ld1rw { z11.s }, p4/Z, [x19]\n" + "whilelt p2.s, x2, x0\n" + "ldp x7, x8, [x21, #0x0]\n" + "mov x19, x2\n" + "incw x19\n" + "ldp x17, x16, [x21, #0x10]\n" + "whilelt p1.s, x19, x0\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z4.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z15.s, z4.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z18.s, z4.s, z16.s\n" + "mov z21.d, z15.d\n" + "ld1sb { z0.h }, p4/Z, [x1]\n" + "mov z5.d, z15.d\n" + "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n" + "mov z13.d, z18.d\n" + "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n" + "mov z7.d, z18.d\n" + "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n" + "mov z6.d, z15.d\n" + "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n" + "mov z8.d, z18.d\n" + "ldp x28, x27, [x5, #0x0]\n" + ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n" + "ldp x26, x25, [x5, #0x10]\n" + ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n" + ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n" + "ldp x24, x23, [x5, #0x20]\n" + ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n" + ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n" + "ldp x22, x21, [x5, #0x30]\n" + "ldp x20, x19, [x5, #0x40]\n" + "ld1b { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n" + "ld1b { z30.h }, p3/Z, [x27, x2]\n" + "ld1b { z29.h }, p3/Z, [x26, x2]\n" + ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n" + "ld1b { z28.h }, p3/Z, [x25, x2]\n" + ".inst 0x45491bbd // usublb z29.h, z29.b, z9.b\n" + "ld1b { z27.h }, p3/Z, [x24, x2]\n" + "ld1b { z23.h }, p3/Z, [x23, x2]\n" + ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n" + "ld1b { z25.h }, p3/Z, [x22, x2]\n" + "ld1b { z24.h }, p3/Z, [x21, x2]\n" + ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n" + ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n" + "ld1b { z26.h }, p3/Z, [x20, x2]\n" + "ld1b { z22.h }, p3/Z, [x19, x2]\n" + ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n" + ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n" + ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n" + ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n" + "1:" // Loop + ".inst 0x448043ef // smlalb z15.s, p4/M, z31.h, z0.h\n" + "ldr x20, [x5, #0x50]\n" + "whilelt p0.h, x3, x0\n" + ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n" + "ldr x19, [x5, #0x58]\n" + ".inst 0x448043d5 // smlalb z21.s, p4/M, z30.h, z0.h\n" + "ldr x25, [x5, #0x60]\n" + ".inst 0x448047cd // smlalt z13.s, p4/M, z30.h, z0.h\n" + "ld1b { z31.h }, p3/Z, [x20, x2]\n" + ".inst 0x448043a5 // smlalb z5.s, p4/M, z29.h, z0.h\n" + "ldr x24, [x5, #0x68]\n" + ".inst 0x448047a7 // smlalt z7.s, p4/M, z29.h, z0.h\n" + "ldr x23, [x5, #0x70]\n" + ".inst 0x44804386 // smlalb z6.s, p4/M, z28.h, z0.h\n" + "ldr x22, [x5, #0x78]\n" + ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n" + "ldr x15, [x5, #0x80]\n" + ".inst 0x44804788 // smlalt z8.s, p4/M, z28.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x448143cf // smlalb z15.s, p4/M, z30.h, z1.h\n" + "ldr x21, [x5, #0x88]\n" + ".inst 0x448147d2 // smlalt z18.s, p4/M, z30.h, z1.h\n" + "ld1b { z30.h }, p3/Z, [x19, x2]\n" + ".inst 0x44814375 // smlalb z21.s, p4/M, z27.h, z1.h\n" + "ldr x20, [x5, #0x90]\n" + ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n" + "ldr x19, [x5, #0x98]\n" + ".inst 0x4481476d // smlalt z13.s, p4/M, z27.h, z1.h\n" + "ldr x14, [x5, #0xa0]\n" + ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n" + "ldr x13, [x5, #0xa8]\n" + 
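// Descriptive note: each ".inst" word in this loop is the raw encoding of an SVE2 instruction (SMLALB/SMLALT widening multiply-accumulates on the low/high halves of each vector, SSUBLB/USUBLB zero-point subtractions); the trailing "//" annotation gives the decoded mnemonic, and emitting raw encodings presumably keeps the file assembling on toolchains whose assemblers lack SVE2 mnemonic support. +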
".inst 0x44814385 // smlalb z5.s, p4/M, z28.h, z1.h\n" + "ldr x12, [x5, #0xb0]\n" + ".inst 0x44814787 // smlalt z7.s, p4/M, z28.h, z1.h\n" + "ldr x11, [x5, #0xb8]\n" + ".inst 0x448142e6 // smlalb z6.s, p4/M, z23.h, z1.h\n" + "ldr x10, [x5, #0xc0]\n" + ".inst 0x448146e8 // smlalt z8.s, p4/M, z23.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x4482436f // smlalb z15.s, p4/M, z27.h, z2.h\n" + "ldr x9, [x5, #0xc8]\n" + ".inst 0x44824772 // smlalt z18.s, p4/M, z27.h, z2.h\n" + "ld1b { z27.h }, p3/Z, [x25, x2]\n" + ".inst 0x44824335 // smlalb z21.s, p4/M, z25.h, z2.h\n" + "ldr x28, [x5, #0xd0]\n" + ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n" + "ldr x27, [x5, #0xd8]\n" + ".inst 0x4482472d // smlalt z13.s, p4/M, z25.h, z2.h\n" + "ldr x26, [x5, #0xe0]\n" + ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n" + "ld1w { z19.s }, p2/Z, [x4]\n" + ".inst 0x448242e5 // smlalb z5.s, p4/M, z23.h, z2.h\n" + "ld1w { z16.s }, p1/Z, [x4, #1, MUL VL]\n" + "addvl x4, x4, #2\n" + ".inst 0x448246e7 // smlalt z7.s, p4/M, z23.h, z2.h\n" + ".inst 0x448243e6 // smlalb z6.s, p4/M, z31.h, z2.h\n" + ".inst 0x448247e8 // smlalt z8.s, p4/M, z31.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + "uzp1 z10.s, z19.s, z16.s\n" + "uzp2 z20.s, z19.s, z16.s\n" + "ld1w { z19.s }, p2/Z, [x6]\n" + ".inst 0x4483432f // smlalb z15.s, p4/M, z25.h, z3.h\n" + "ld1w { z16.s }, p1/Z, [x6, #1, MUL VL]\n" + "addvl x6, x6, #2\n" + ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n" + ".inst 0x44834732 // smlalt z18.s, p4/M, z25.h, z3.h\n" + "ld1b { z25.h }, p3/Z, [x24, x2]\n" + ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n" + "ldr x25, [x5, #0xe8]\n" + ".inst 0x4483470d // smlalt z13.s, p4/M, z24.h, z3.h\n" + ".inst 0x448343e5 // smlalb z5.s, p4/M, z31.h, z3.h\n" + ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n" + ".inst 0x448347e7 // smlalt z7.s, p4/M, z31.h, z3.h\n" + ".inst 0x448343c6 // smlalb z6.s, p4/M, z30.h, z3.h\n" + ".inst 0x448347c8 // smlalt z8.s, p4/M, z30.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1]\n" + ".inst 0x4484430f // smlalb z15.s, p4/M, z24.h, z4.h\n" + ".inst 0x44844712 // smlalt z18.s, p4/M, z24.h, z4.h\n" + "ld1b { z24.h }, p3/Z, [x23, x2]\n" + ".inst 0x44844375 // smlalb z21.s, p4/M, z27.h, z4.h\n" + "ldr x24, [x5, #0xf0]\n" + ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n" + ".inst 0x4484476d // smlalt z13.s, p4/M, z27.h, z4.h\n" + "ld1b { z27.h }, p3/Z, [x22, x2]\n" + ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n" + "ldr x23, [x5, #0xf8]\n" + ".inst 0x448443c5 // smlalb z5.s, p4/M, z30.h, z4.h\n" + ".inst 0x448447c7 // smlalt z7.s, p4/M, z30.h, z4.h\n" + ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n" + ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1, #1, MUL VL]\n" + ".inst 0x448043af // smlalb z15.s, p4/M, z29.h, z0.h\n" + ".inst 0x448047b2 // smlalt z18.s, p4/M, z29.h, z0.h\n" + "uzp1 z29.s, z19.s, z16.s\n" + ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n" + "uzp2 z19.s, z19.s, z16.s\n" + ".inst 0x44804395 // smlalb z21.s, p4/M, z28.h, z0.h\n" + ".inst 0x4480478d // smlalt z13.s, p4/M, z28.h, z0.h\n" + ".inst 0x448042c5 // smlalb z5.s, p4/M, z22.h, z0.h\n" + ".inst 0x448046c7 // smlalt z7.s, p4/M, z22.h, z0.h\n" + ".inst 0x44804326 // smlalb z6.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804728 // smlalt z8.s, p4/M, z25.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x4481438f // smlalb z15.s, p4/M, z28.h, z1.h\n" + ".inst 0x44814792 // smlalt 
z18.s, p4/M, z28.h, z1.h\n" + "ld1b { z28.h }, p3/Z, [x21, x2]\n" + ".inst 0x448142f5 // smlalb z21.s, p4/M, z23.h, z1.h\n" + "ldr x22, [x5, #0x100]\n" + ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n" + ".inst 0x448146ed // smlalt z13.s, p4/M, z23.h, z1.h\n" + ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n" + ".inst 0x44814325 // smlalb z5.s, p4/M, z25.h, z1.h\n" + ".inst 0x44814727 // smlalt z7.s, p4/M, z25.h, z1.h\n" + ".inst 0x44814306 // smlalb z6.s, p4/M, z24.h, z1.h\n" + ".inst 0x44814708 // smlalt z8.s, p4/M, z24.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1, #3, MUL VL]\n" + ".inst 0x448242ef // smlalb z15.s, p4/M, z23.h, z2.h\n" + ".inst 0x448246f2 // smlalt z18.s, p4/M, z23.h, z2.h\n" + "ld1b { z23.h }, p3/Z, [x15, x2]\n" + ".inst 0x448243f5 // smlalb z21.s, p4/M, z31.h, z2.h\n" + "ldr x21, [x5, #0x108]\n" + ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n" + ".inst 0x448247ed // smlalt z13.s, p4/M, z31.h, z2.h\n" + ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n" + ".inst 0x44824305 // smlalb z5.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824707 // smlalt z7.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824366 // smlalb z6.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824768 // smlalt z8.s, p4/M, z27.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x448343ef // smlalb z15.s, p4/M, z31.h, z3.h\n" + ".inst 0x448347f2 // smlalt z18.s, p4/M, z31.h, z3.h\n" + "ld1b { z31.h }, p3/Z, [x20, x2]\n" + ".inst 0x448343d5 // smlalb z21.s, p4/M, z30.h, z3.h\n" + "ldr x20, [x5, #0x110]\n" + ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n" + ".inst 0x448347cd // smlalt z13.s, p4/M, z30.h, z3.h\n" + ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n" + ".inst 0x44834365 // smlalb z5.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834767 // smlalt z7.s, p4/M, z27.h, z3.h\n" + ".inst 0x448342e6 // smlalb z6.s, p4/M, z23.h, z3.h\n" + ".inst 0x448346e8 // smlalt z8.s, p4/M, z23.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x448443cf // smlalb z15.s, p4/M, z30.h, z4.h\n" + ".inst 0x448447d2 // smlalt z18.s, p4/M, z30.h, z4.h\n" + "ld1b { z30.h }, p3/Z, [x19, x2]\n" + ".inst 0x44844355 // smlalb z21.s, p4/M, z26.h, z4.h\n" + "ldr x19, [x5, #0x118]\n" + ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n" + ".inst 0x4484474d // smlalt z13.s, p4/M, z26.h, z4.h\n" + "ld1b { z26.h }, p3/Z, [x14, x2]\n" + ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n" + ".inst 0x448442e5 // smlalb z5.s, p4/M, z23.h, z4.h\n" + ".inst 0x448446e7 // smlalt z7.s, p4/M, z23.h, z4.h\n" + ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n" + ".inst 0x44844386 // smlalb z6.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844788 // smlalt z8.s, p4/M, z28.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x448042cf // smlalb z15.s, p4/M, z22.h, z0.h\n" + ".inst 0x448046d2 // smlalt z18.s, p4/M, z22.h, z0.h\n" + "ld1b { z22.h }, p3/Z, [x11, x2]\n" + ".inst 0x44804335 // smlalb z21.s, p4/M, z25.h, z0.h\n" + ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n" + ".inst 0x4480472d // smlalt z13.s, p4/M, z25.h, z0.h\n" + ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n" + ".inst 0x448043e5 // smlalb z5.s, p4/M, z31.h, z0.h\n" + ".inst 0x448047e7 // smlalt z7.s, p4/M, z31.h, z0.h\n" + ".inst 0x448043c6 // smlalb z6.s, p4/M, z30.h, z0.h\n" + ".inst 0x448047c8 // smlalt z8.s, p4/M, z30.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4481432f // smlalb z15.s, p4/M, z25.h, z1.h\n" + ".inst 0x44814732 // smlalt z18.s, p4/M, z25.h, z1.h\n" + "ld1b { z25.h }, p3/Z, [x13, x2]\n" + ".inst 0x44814315 // smlalb 
z21.s, p4/M, z24.h, z1.h\n" + ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n" + ".inst 0x4481470d // smlalt z13.s, p4/M, z24.h, z1.h\n" + ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n" + ".inst 0x448143c5 // smlalb z5.s, p4/M, z30.h, z1.h\n" + ".inst 0x448147c7 // smlalt z7.s, p4/M, z30.h, z1.h\n" + ".inst 0x44814346 // smlalb z6.s, p4/M, z26.h, z1.h\n" + ".inst 0x44814748 // smlalt z8.s, p4/M, z26.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1]\n" + ".inst 0x4482430f // smlalb z15.s, p4/M, z24.h, z2.h\n" + ".inst 0x44824712 // smlalt z18.s, p4/M, z24.h, z2.h\n" + "ld1b { z24.h }, p3/Z, [x12, x2]\n" + ".inst 0x44824375 // smlalb z21.s, p4/M, z27.h, z2.h\n" + ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n" + ".inst 0x4482476d // smlalt z13.s, p4/M, z27.h, z2.h\n" + ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n" + ".inst 0x44824345 // smlalb z5.s, p4/M, z26.h, z2.h\n" + ".inst 0x44824747 // smlalt z7.s, p4/M, z26.h, z2.h\n" + ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n" + ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #1, MUL VL]\n" + ".inst 0x4483436f // smlalb z15.s, p4/M, z27.h, z3.h\n" + ".inst 0x44834772 // smlalt z18.s, p4/M, z27.h, z3.h\n" + "ld1b { z27.h }, p3/Z, [x10, x2]\n" + ".inst 0x448342f5 // smlalb z21.s, p4/M, z23.h, z3.h\n" + ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n" + ".inst 0x448346ed // smlalt z13.s, p4/M, z23.h, z3.h\n" + ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n" + ".inst 0x44834325 // smlalb z5.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834727 // smlalt z7.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1, #2, MUL VL]\n" + ".inst 0x448442ef // smlalb z15.s, p4/M, z23.h, z4.h\n" + ".inst 0x448446f2 // smlalt z18.s, p4/M, z23.h, z4.h\n" + "ld1b { z23.h }, p3/Z, [x9, x2]\n" + ".inst 0x44844395 // smlalb z21.s, p4/M, z28.h, z4.h\n" + ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n" + ".inst 0x4484478d // smlalt z13.s, p4/M, z28.h, z4.h\n" + "ld1b { z28.h }, p3/Z, [x26, x2]\n" + ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n" + ".inst 0x44844305 // smlalb z5.s, p4/M, z24.h, z4.h\n" + ".inst 0x44844707 // smlalt z7.s, p4/M, z24.h, z4.h\n" + ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n" + ".inst 0x448442c6 // smlalb z6.s, p4/M, z22.h, z4.h\n" + ".inst 0x448446c8 // smlalt z8.s, p4/M, z22.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1, #3, MUL VL]\n" + ".inst 0x448043ef // smlalb z15.s, p4/M, z31.h, z0.h\n" + ".inst 0x448047f2 // smlalt z18.s, p4/M, z31.h, z0.h\n" + "ld1b { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x448043d5 // smlalb z21.s, p4/M, z30.h, z0.h\n" + ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n" + ".inst 0x448047cd // smlalt z13.s, p4/M, z30.h, z0.h\n" + ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n" + ".inst 0x44804365 // smlalb z5.s, p4/M, z27.h, z0.h\n" + ".inst 0x44804767 // smlalt z7.s, p4/M, z27.h, z0.h\n" + ".inst 0x448042e6 // smlalb z6.s, p4/M, z23.h, z0.h\n" + ".inst 0x448046e8 // smlalt z8.s, p4/M, z23.h, z0.h\n" + "ld1sb { z0.h }, p4/Z, [x1, #4, MUL VL]\n" + ".inst 0x448143cf // smlalb z15.s, p4/M, z30.h, z1.h\n" + ".inst 0x448147d2 // smlalt z18.s, p4/M, z30.h, z1.h\n" + "ld1b { z30.h }, p3/Z, [x27, x2]\n" + ".inst 0x44814355 // smlalb z21.s, p4/M, z26.h, z1.h\n" + ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n" + ".inst 0x4481474d // smlalt z13.s, p4/M, z26.h, z1.h\n" + ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n" + ".inst 0x448142e5 // smlalb z5.s, p4/M, z23.h, z1.h\n" + 
".inst 0x448146e7 // smlalt z7.s, p4/M, z23.h, z1.h\n" + ".inst 0x448143e6 // smlalb z6.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147e8 // smlalt z8.s, p4/M, z31.h, z1.h\n" + "ld1sb { z1.h }, p4/Z, [x1, #5, MUL VL]\n" + ".inst 0x4482434f // smlalb z15.s, p4/M, z26.h, z2.h\n" + ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n" + "ld1b { z26.h }, p3/Z, [x25, x2]\n" + ".inst 0x44824335 // smlalb z21.s, p4/M, z25.h, z2.h\n" + ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n" + ".inst 0x4482472d // smlalt z13.s, p4/M, z25.h, z2.h\n" + ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n" + ".inst 0x448243e5 // smlalb z5.s, p4/M, z31.h, z2.h\n" + ".inst 0x448247e7 // smlalt z7.s, p4/M, z31.h, z2.h\n" + ".inst 0x448243c6 // smlalb z6.s, p4/M, z30.h, z2.h\n" + ".inst 0x448247c8 // smlalt z8.s, p4/M, z30.h, z2.h\n" + "ld1sb { z2.h }, p4/Z, [x1, #6, MUL VL]\n" + ".inst 0x4483432f // smlalb z15.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834732 // smlalt z18.s, p4/M, z25.h, z3.h\n" + "ld1b { z25.h }, p3/Z, [x24, x2]\n" + ".inst 0x44834315 // smlalb z21.s, p4/M, z24.h, z3.h\n" + ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n" + ".inst 0x4483470d // smlalt z13.s, p4/M, z24.h, z3.h\n" + ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n" + ".inst 0x448343c5 // smlalb z5.s, p4/M, z30.h, z3.h\n" + ".inst 0x448347c7 // smlalt z7.s, p4/M, z30.h, z3.h\n" + ".inst 0x44834386 // smlalb z6.s, p4/M, z28.h, z3.h\n" + ".inst 0x44834788 // smlalt z8.s, p4/M, z28.h, z3.h\n" + "ld1sb { z3.h }, p4/Z, [x1, #7, MUL VL]\n" + "inch x1, ALL, MUL #8\n" + ".inst 0x4484430f // smlalb z15.s, p4/M, z24.h, z4.h\n" + ".inst 0x44844712 // smlalt z18.s, p4/M, z24.h, z4.h\n" + "ld1b { z24.h }, p3/Z, [x23, x2]\n" + ".inst 0x448442d5 // smlalb z21.s, p4/M, z22.h, z4.h\n" + ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n" + ".inst 0x448446cd // smlalt z13.s, p4/M, z22.h, z4.h\n" + ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n" + ".inst 0x44844385 // smlalb z5.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844787 // smlalt z7.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844346 // smlalb z6.s, p4/M, z26.h, z4.h\n" + ".inst 0x44844748 // smlalt z8.s, p4/M, z26.h, z4.h\n" + "ld1sb { z4.h }, p4/Z, [x1]\n" + "inch x1\n" + ".inst 0x4480436f // smlalb z15.s, p4/M, z27.h, z0.h\n" + ".inst 0x44804772 // smlalt z18.s, p4/M, z27.h, z0.h\n" + "ld1b { z27.h }, p3/Z, [x22, x2]\n" + ".inst 0x448042f5 // smlalb z21.s, p4/M, z23.h, z0.h\n" + ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n" + ".inst 0x448046ed // smlalt z13.s, p4/M, z23.h, z0.h\n" + ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n" + ".inst 0x44804325 // smlalb z5.s, p4/M, z25.h, z0.h\n" + ".inst 0x44804727 // smlalt z7.s, p4/M, z25.h, z0.h\n" + "ld1b { z25.h }, p3/Z, [x21, x2]\n" + ".inst 0x44804306 // smlalb z6.s, p4/M, z24.h, z0.h\n" + ".inst 0x44804708 // smlalt z8.s, p4/M, z24.h, z0.h\n" + ".inst 0x448142ef // smlalb z15.s, p4/M, z23.h, z1.h\n" + ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n" + ".inst 0x448146f2 // smlalt z18.s, p4/M, z23.h, z1.h\n" + ".inst 0x448143f5 // smlalb z21.s, p4/M, z31.h, z1.h\n" + ".inst 0x448147ed // smlalt z13.s, p4/M, z31.h, z1.h\n" + ".inst 0x44814305 // smlalb z5.s, p4/M, z24.h, z1.h\n" + ".inst 0x44814707 // smlalt z7.s, p4/M, z24.h, z1.h\n" + "ld1b { z24.h }, p3/Z, [x20, x2]\n" + ".inst 0x44814366 // smlalb z6.s, p4/M, z27.h, z1.h\n" + ".inst 0x44814768 // smlalt z8.s, p4/M, z27.h, z1.h\n" + ".inst 0x448243ef // smlalb z15.s, p4/M, z31.h, z2.h\n" + ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n" + ".inst 0x448247f2 // smlalt z18.s, p4/M, z31.h, z2.h\n" + ".inst 
0x448243d5 // smlalb z21.s, p4/M, z30.h, z2.h\n" + ".inst 0x448247cd // smlalt z13.s, p4/M, z30.h, z2.h\n" + ".inst 0x44824365 // smlalb z5.s, p4/M, z27.h, z2.h\n" + ".inst 0x44824767 // smlalt z7.s, p4/M, z27.h, z2.h\n" + "ld1b { z27.h }, p3/Z, [x19, x2]\n" + "inch x2\n" + ".inst 0x44824326 // smlalb z6.s, p4/M, z25.h, z2.h\n" + "whilelt p2.s, x2, x0\n" + ".inst 0x44824728 // smlalt z8.s, p4/M, z25.h, z2.h\n" + "mov x19, x2\n" + ".inst 0x448343cf // smlalb z15.s, p4/M, z30.h, z3.h\n" + "incw x19\n" + ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n" + "whilelt p1.s, x19, x0\n" + ".inst 0x448347d2 // smlalt z18.s, p4/M, z30.h, z3.h\n" + "whilelt p3.h, x2, x0\n" + ".inst 0x44834395 // smlalb z21.s, p4/M, z28.h, z3.h\n" + ".inst 0x4483478d // smlalt z13.s, p4/M, z28.h, z3.h\n" + ".inst 0x44834325 // smlalb z5.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834727 // smlalt z7.s, p4/M, z25.h, z3.h\n" + ".inst 0x44834306 // smlalb z6.s, p4/M, z24.h, z3.h\n" + ".inst 0x44834708 // smlalt z8.s, p4/M, z24.h, z3.h\n" + ".inst 0x4484438f // smlalb z15.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844792 // smlalt z18.s, p4/M, z28.h, z4.h\n" + ".inst 0x44844355 // smlalb z21.s, p4/M, z26.h, z4.h\n" + ".inst 0x4484474d // smlalt z13.s, p4/M, z26.h, z4.h\n" + ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n" + ".inst 0x04b47652 // sqrdmulh z18.s, z18.s, z20.s\n" + ".inst 0x04aa76b5 // sqrdmulh z21.s, z21.s, z10.s\n" + ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n" + "and z28.d, z15.d, z29.d\n" + "and z26.d, z18.d, z19.d\n" + "and z16.d, z21.d, z29.d\n" + "asr z28.s, z28.s, #0x1f\n" + "asr z26.s, z26.s, #0x1f\n" + "asr z16.s, z16.s, #0x1f\n" + "sqadd z15.s, z15.s, z28.s\n" + "sqadd z18.s, z18.s, z26.s\n" + "sqadd z21.s, z21.s, z16.s\n" + "and z16.d, z13.d, z19.d\n" + ".inst 0x44844305 // smlalb z5.s, p4/M, z24.h, z4.h\n" + ".inst 0x44844707 // smlalt z7.s, p4/M, z24.h, z4.h\n" + "asr z16.s, z16.s, #0x1f\n" + ".inst 0x44844366 // smlalb z6.s, p4/M, z27.h, z4.h\n" + ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n" + "sqadd z13.s, z13.s, z16.s\n" + ".inst 0x04b474e7 // sqrdmulh z7.s, z7.s, z20.s\n" + ".inst 0x04aa74c6 // sqrdmulh z6.s, z6.s, z10.s\n" + "and z16.d, z5.d, z29.d\n" + ".inst 0x44844768 // smlalt z8.s, p4/M, z27.h, z4.h\n" + "and z25.d, z7.d, z19.d\n" + "asr z16.s, z16.s, #0x1f\n" + "and z26.d, z6.d, z29.d\n" + "asr z25.s, z25.s, #0x1f\n" + "sqadd z5.s, z5.s, z16.s\n" + "asr z26.s, z26.s, #0x1f\n" + "sqadd z7.s, z7.s, z25.s\n" + ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n" + "sqadd z6.s, z6.s, z26.s\n" + ".inst 0x448293af // srshl z15.s, p4/M, z15.s, z29.s\n" + ".inst 0x44829272 // srshl z18.s, p4/M, z18.s, z19.s\n" + "and z16.d, z8.d, z19.d\n" + ".inst 0x448293b5 // srshl z21.s, p4/M, z21.s, z29.s\n" + "add z15.s, z15.s, z17.s\n" + "add z18.s, z18.s, z17.s\n" + "asr z16.s, z16.s, #0x1f\n" + "add z21.s, z21.s, z17.s\n" + "smin z15.s, p4/M, z15.s, z11.s\n" + "sqadd z8.s, z8.s, z16.s\n" + "smin z18.s, p4/M, z18.s, z11.s\n" + "smin z21.s, p4/M, z21.s, z11.s\n" + "smax z15.s, p4/M, z15.s, z12.s\n" + ".inst 0x4482926d // srshl z13.s, p4/M, z13.s, z19.s\n" + "smax z18.s, p4/M, z18.s, z12.s\n" + "smax z21.s, p4/M, z21.s, z12.s\n" + ".inst 0x448293a5 // srshl z5.s, p4/M, z5.s, z29.s\n" + "add z13.s, z13.s, z17.s\n" + "trn1 z15.h, z15.h, z18.h\n" + "st1b { z15.h }, p0, [x7, x3]\n" + "add z5.s, z5.s, z17.s\n" + "smin z13.s, p4/M, z13.s, z11.s\n" + ".inst 0x44829267 // srshl z7.s, p4/M, z7.s, z19.s\n" + ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n" + "smin z5.s, p4/M, z5.s, z11.s\n" + "smax z13.s, 
p4/M, z13.s, z12.s\n" + "add z7.s, z7.s, z17.s\n" + "add z6.s, z6.s, z17.s\n" + "smax z5.s, p4/M, z5.s, z12.s\n" + "trn1 z21.h, z21.h, z13.h\n" + "st1b { z21.h }, p0, [x8, x3]\n" + "smin z7.s, p4/M, z7.s, z11.s\n" + "smin z6.s, p4/M, z6.s, z11.s\n" + ".inst 0x44829268 // srshl z8.s, p4/M, z8.s, z19.s\n" + "smax z7.s, p4/M, z7.s, z12.s\n" + "smax z6.s, p4/M, z6.s, z12.s\n" + "add z8.s, z8.s, z17.s\n" + "trn1 z5.h, z5.h, z7.h\n" + "st1b { z5.h }, p0, [x17, x3]\n" + "smin z8.s, p4/M, z8.s, z11.s\n" + "smax z8.s, p4/M, z8.s, z12.s\n" + "trn1 z6.h, z6.h, z8.h\n" + "st1b { z6.h }, p0, [x16, x3]\n" + "inch x3\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ld1w { z4.s }, p2/Z, [x19]\n" + "ld1w { z16.s }, p1/Z, [x19, #1, MUL VL]\n" + "uzp1 z15.s, z4.s, z16.s\n" + "addvl x19, x19, #2\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "uzp2 z18.s, z4.s, z16.s\n" + "mov z21.d, z15.d\n" + "ld1sb { z0.h }, p4/Z, [x1]\n" + "mov z5.d, z15.d\n" + "ld1sb { z1.h }, p4/Z, [x1, #1, MUL VL]\n" + "mov z13.d, z18.d\n" + "ld1sb { z2.h }, p4/Z, [x1, #2, MUL VL]\n" + "mov z7.d, z18.d\n" + "ld1sb { z3.h }, p4/Z, [x1, #3, MUL VL]\n" + "mov z6.d, z15.d\n" + "ld1sb { z4.h }, p4/Z, [x1, #4, MUL VL]\n" + "mov z8.d, z18.d\n" + "ldp x28, x27, [x5, #0x0]\n" + ".inst 0x454e1000 // ssublb z0.h, z0.b, z14.b\n" + "ldp x26, x25, [x5, #0x10]\n" + ".inst 0x454e1021 // ssublb z1.h, z1.b, z14.b\n" + ".inst 0x454e1042 // ssublb z2.h, z2.b, z14.b\n" + "ldp x24, x23, [x5, #0x20]\n" + ".inst 0x454e1063 // ssublb z3.h, z3.b, z14.b\n" + ".inst 0x454e1084 // ssublb z4.h, z4.b, z14.b\n" + "ldp x22, x21, [x5, #0x30]\n" + "ldp x20, x19, [x5, #0x40]\n" + "ld1b { z31.h }, p3/Z, [x28, x2]\n" + ".inst 0x45491bff // usublb z31.h, z31.b, z9.b\n" + "ld1b { z30.h }, p3/Z, [x27, x2]\n" + "ld1b { z29.h }, p3/Z, [x26, x2]\n" + ".inst 0x45491bde // usublb z30.h, z30.b, z9.b\n" + "ld1b { z28.h }, p3/Z, [x25, x2]\n" + ".inst 0x45491bbd // usublb z29.h, z29.b, z9.b\n" + "ld1b { z27.h }, p3/Z, [x24, x2]\n" + "ld1b { z23.h }, p3/Z, [x23, x2]\n" + ".inst 0x45491b9c // usublb z28.h, z28.b, z9.b\n" + "ld1b { z25.h }, p3/Z, [x22, x2]\n" + "ld1b { z24.h }, p3/Z, [x21, x2]\n" + ".inst 0x45491b7b // usublb z27.h, z27.b, z9.b\n" + ".inst 0x45491af7 // usublb z23.h, z23.b, z9.b\n" + "ld1b { z26.h }, p3/Z, [x20, x2]\n" + "ld1b { z22.h }, p3/Z, [x19, x2]\n" + ".inst 0x45491b39 // usublb z25.h, z25.b, z9.b\n" + ".inst 0x45491b18 // usublb z24.h, z24.b, z9.b\n" + ".inst 0x45491b5a // usublb z26.h, z26.b, z9.b\n" + ".inst 0x45491ad6 // usublb z22.h, z22.b, z9.b\n" + "b.any 1b\n"
+    :
+    : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2)
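The requantize-and-store tail of the kernel above (the `sqrdmulh`, `and`/`asr`/`sqadd`, `srshl`, `add`, `smin`/`smax`, `trn1`/`st1b` sequence) is the standard gemmlowp-style fixed-point requantization: each int32 accumulator is scaled by a fixed-point multiplier (held in z10/z20), right-shifted with rounding (shift amounts in z29/z19), offset by the output zero point (z17), clamped to [minval, maxval] (z12/z11) and narrowed to bytes. A scalar model of that arithmetic, with illustrative helper names that are not part of this patch, looks like this; the rounding fix-up that the `and`/`asr`/`sqadd` triplets perform is expressed here as the remainder test in rounding_divide_by_pot:

#include <algorithm>
#include <cstdint>
#include <limits>

// Saturating rounding doubling high multiply: the scalar analogue of SQRDMULH.
inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    const bool    overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
    const int64_t ab       = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge    = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return overflow ? std::numeric_limits<int32_t>::max()
                    : static_cast<int32_t>((ab + nudge) >> 31);
}

// Rounding arithmetic right shift: the scalar analogue of SRSHL by a negative
// amount, including the correction applied before the shift in the code above.
inline int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
    const int32_t mask      = static_cast<int32_t>((1ll << exponent) - 1);
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One output value: scale, shift, add the output offset, clamp and narrow.
// The vector code then packs pairs of results with trn1 and stores with st1b.
inline uint8_t requantize(int32_t acc, int32_t multiplier, int right_shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_doubling_high_mul(acc, multiplier);
    v = rounding_divide_by_pot(v, right_shift) + c_offset;
    return static_cast<uint8_t>(std::min(std::max(v, minval), maxval));
}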
"x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(SVE2) diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index 6d483a3b9d..1269ef62a6 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -175,6 +175,18 @@ inline unsigned long get_vector_length() { #endif } +template +inline unsigned long get_vector_length(VLType vl_type) { + switch (vl_type) { +#ifdef __ARM_FEATURE_SVE + case VLType::SVE: + return get_vector_length_sz(); +#endif + default: + return 16 / sizeof(T); + } +} + } // utils namespace } // arm_gemm namespace diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h deleted file mode 100644 index a956898403..0000000000 --- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H -#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H - -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/NEON/INEKernel.h" - -#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** This class is a wrapper for the depthwise convolution assembly kernels. 
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
deleted file mode 100644
index a956898403..0000000000
--- a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** This class is a wrapper for the depthwise convolution assembly kernels.
- */
-class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthwiseConvolutionAssemblyKernelWrapper";
-    }
-
-    /** Default constructor */
-    NEDepthwiseConvolutionAssemblyKernelWrapper()
-        : _kernel(nullptr)
-    {
-    }
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Default Move Constructor. */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-    /** Default move assignment operator */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in] kernel Pointer to an assembly kernel implementation.
-     */
-    void configure(depthwise::IDepthwiseConvolution *kernel)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
-        _kernel = kernel;
-        Window win;
-        win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
-        INEKernel::configure(win);
-    }
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
-        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-        auto first = window.x().start();
-        auto last  = window.x().end();
-        _kernel->run(first, last, info.thread_id);
-    }
-
-private:
-    depthwise::IDepthwiseConvolution *_kernel;
-};
-} // namespace arm_compute
-#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/common.hpp b/src/core/NEON/kernels/assembly/common.hpp
new file mode 100644
index 0000000000..d82d11cae0
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/common.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv
+{
+struct PaddingValues
+{
+    unsigned int left, top, right, bottom;
+};
+
+} // namespace arm_conv
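PaddingValues is now shared between the depthwise and pooling interfaces (the pool_common.hpp hunk later in this patch deletes its private copy). For reference, this is how the struct enters the usual output-shape arithmetic; the helper below is a sketch for illustration, not a function added by the patch:

// Standard convolution output-size relation, shown for one spatial dimension.
unsigned int output_size(unsigned int input, unsigned int kernel,
                         unsigned int stride, unsigned int pad_before,
                         unsigned int pad_after)
{
    return (input + pad_before + pad_after - kernel) / stride + 1;
}

// e.g. a 3x3 stride-1 kernel with SAME padding on a 56-row input:
// with pad.top == pad.bottom == 1, output_size(56, 3, 1, pad.top, pad.bottom) == 56.
arm_conv::PaddingValues pad{1, 1, 1, 1}; // left, top, right, bottom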
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
new file mode 100644
index 0000000000..eadf48d003
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "arm_gemm_local.hpp"
+#include "depthwise_common.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+struct DepthwiseConfig
+{
+    DepthwiseMethod method = DepthwiseMethod::DEFAULT;
+    std::string     filter = "";
+
+    DepthwiseConfig(DepthwiseMethod method)
+        : method(method) {};
+    DepthwiseConfig() {};
+};
+
+struct DepthwiseArgs
+{
+    const CPUInfo *cpu_info;
+
+    unsigned int kernel_rows, kernel_cols;
+    unsigned int stride_rows, stride_cols;
+
+    unsigned int n_batches, input_rows, input_cols, input_channels;
+    unsigned int output_rows, output_cols;
+    unsigned int channel_multiplier;
+
+    PaddingValues padding;
+
+    arm_gemm::Activation activation;
+
+    const DepthwiseConfig *config;
+
+    DepthwiseArgs(
+        const CPUInfo *cpu_info,
+        unsigned int kernel_rows, unsigned int kernel_cols,
+        unsigned int stride_rows, unsigned int stride_cols,
+        unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
+        unsigned int input_channels,
+        unsigned int output_rows, unsigned int output_cols,
+        unsigned int channel_multiplier,
+        PaddingValues padding, arm_gemm::Activation activation,
+        const DepthwiseConfig *config)
+        : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), stride_rows(stride_rows), stride_cols(stride_cols), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
+          input_channels(input_channels), output_rows(output_rows), output_cols(output_cols), channel_multiplier(channel_multiplier), padding(padding), activation(activation), config(config)
+    {
+    }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseCommon : public IDepthwiseCommon
+{
+protected:
+    const DepthwiseArgs m_args; // Copy of arguments
+
+public:
+    DepthwiseCommon(const DepthwiseArgs &args)
+        : m_args(args) {};
+    DepthwiseCommon(DepthwiseCommon &) = delete;
+    DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
+
+    void execute(
+        const void *const  input,
+        const void *const  parameters,
+        void *const        output,
+        void *const        working_space,
+        const unsigned int thread_id,
+        const unsigned int n_threads) const override
+    {
+        const size_t ld_input_col    = m_args.input_channels;
+        const size_t ld_input_row    = ld_input_col * m_args.input_cols;
+        const size_t ld_input_batch  = ld_input_row * m_args.input_rows;
+        const size_t ld_output_col   = m_args.input_channels * m_args.channel_multiplier;
+        const size_t ld_output_row   = ld_output_col * m_args.output_cols;
+        const size_t ld_output_batch = ld_output_row * m_args.output_rows;
+
+        execute(
+            input, ld_input_col, ld_input_row, ld_input_batch,
+            parameters, output, ld_output_col, ld_output_row, ld_output_batch,
+            working_space, thread_id, n_threads);
+    }
+
+    void execute(
+        const void *const  input,
+        size_t             ld_input_col,
+        size_t             ld_input_row,
+        size_t             ld_input_batch,
+        const void *const  parameters,
+        void *const        output,
+        size_t             ld_output_col,
+        size_t             ld_output_row,
+        size_t             ld_output_batch,
+        void *const        working_space,
+        const unsigned int thread_id,
+        const unsigned int n_threads) const override
+    {
+        execute(
+            m_args.n_batches, m_args.input_rows, m_args.input_cols,
+            m_args.input_channels, m_args.padding,
+            input, ld_input_col, ld_input_row, ld_input_batch,
+            parameters,
+            m_args.output_rows, m_args.output_cols,
+            output, ld_output_col, ld_output_row, ld_output_batch,
+            working_space, thread_id, n_threads);
+    }
+
+    virtual void execute(
+        unsigned int batches,
+        unsigned int input_height,
+        unsigned int input_width,
+        unsigned int channels,
+        const PaddingValues &,
+        const void *input,
+        size_t      ld_input_col,
+        size_t      ld_input_row,
+        size_t      ld_input_batch,
+        const void *parameters,
+        unsigned int output_height,
+        unsigned int output_width,
+        void *output,
+        size_t ld_output_col,
+        size_t ld_output_row,
+        size_t ld_output_batch,
+        void *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const override = 0;
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>
+using UniqueDepthwiseCommon = std::unique_ptr<DepthwiseCommon<TInput, TWeight, TOutput>>;
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+KernelDescription get_depthwise_method(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &, const OutputStage & = {});
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &, const OutputStage & = {});
+
+} // namespace depthwise
+} // namespace arm_conv
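The stride-free execute overload above derives dense NHWC leading dimensions from the shape arguments, so a caller only needs a DepthwiseArgs and the factory. A sketch of that setup, with illustrative template arguments and sizes (KernelDescription and the execution methods come from depthwise_common.hpp, added below); assuming arm_gemm::Activation default-constructs to "no activation" as elsewhere in the library:

#include "depthwise.hpp"

// Describe a 3x3 stride-1 depthwise layer over a 1x56x56x64 NHWC input with
// SAME padding, then ask the factory for an implementation.
void make_dwc(const CPUInfo *cpu_info)
{
    using namespace arm_conv;
    using namespace arm_conv::depthwise;

    DepthwiseConfig config(DepthwiseMethod::DEFAULT);
    DepthwiseArgs   args(cpu_info,
                         3, 3,          // kernel rows, cols
                         1, 1,          // stride rows, cols
                         1, 56, 56, 64, // batches, input rows, cols, channels
                         56, 56,        // output rows, cols
                         1,             // channel multiplier
                         PaddingValues{1, 1, 1, 1},
                         arm_gemm::Activation(),
                         &config);

    // Inspect what would be selected, or build the operator directly.
    KernelDescription desc = get_depthwise_method<float, float, float>(args);
    auto dwc = depthwise<float, float, float>(args); // UniqueDepthwiseCommon
    (void)desc;
    (void)dwc;
}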
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
new file mode 100644
index 0000000000..52963ab357
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "common.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+using arm_gemm::Nothing;
+
+enum class DepthwiseMethod
+{
+    DEFAULT,
+    DEPTHFIRST,
+    PLANAR,
+};
+
+struct KernelDescription
+{
+    DepthwiseMethod method         = DepthwiseMethod::DEFAULT;
+    std::string     name           = "";
+    bool            is_default     = false;
+    uint64_t        cycle_estimate = 0;
+
+    KernelDescription(
+        DepthwiseMethod method,
+        std::string     name,
+        bool            is_default,
+        uint64_t        cycle_estimate)
+        : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
+    {
+    }
+
+    KernelDescription() noexcept {};
+};
+
+class IDepthwiseCommon
+{
+public:
+    virtual ~IDepthwiseCommon() = default;
+
+    // Determine the amount of storage space required for the rearranged weights
+    // and bias.
+    virtual size_t get_storage_size(void) const = 0;
+
+    // Rearrange the weights and biases into a storage buffer.
+    // Accepts a pointer to a buffer into which to store the packed parameters, a
+    // pointer to the bias vector (which may be nullptr in the case of no bias) and
+    // a pointer to the array of weights (stored in HWIO order).
+    virtual void pack_parameters(
+        void       *buffer,
+        const void *biases,
+        const void *weights,
+        size_t      ld_weight_col = 0,
+        size_t      ld_weight_row = 0) = 0;
+
+    // Determine the amount of working space required
+    virtual size_t get_working_size(unsigned int n_threads, unsigned int n_input_channels) const = 0;
+
+    // Execute the convolution over the specified area of memory.
+    virtual void execute(
+        const void *input,      // Pointer to input tensor
+        const void *parameters, // Packed parameters buffer
+        void        *output,
+        void        *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const = 0;
+
+    virtual void execute(
+        const void  *input,
+        size_t       ld_input_col,
+        size_t       ld_input_row,
+        size_t       ld_input_batch,
+        const void  *parameters,
+        void        *output,
+        size_t       ld_output_col,
+        size_t       ld_output_row,
+        size_t       ld_output_batch,
+        void        *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const = 0;
+
+    virtual void execute(
+        unsigned int batches,
+        unsigned int input_height,
+        unsigned int input_width,
+        unsigned int channels,
+        const PaddingValues &,
+        const void  *input,
+        size_t       ld_input_col,
+        size_t       ld_input_row,
+        size_t       ld_input_batch,
+        const void  *parameters,
+        unsigned int output_height,
+        unsigned int output_width,
+        void        *output,
+        size_t       ld_output_col,
+        size_t       ld_output_row,
+        size_t       ld_output_batch,
+        void        *working_space,
+        unsigned int thread_id,
+        unsigned int n_threads) const = 0;
+};
+
+} // namespace depthwise
+} // namespace arm_conv
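Putting the interface together, the intended call sequence is: size and fill the packed-parameter buffer once, size the working space for the worst-case thread count, then have each worker call execute() with its own thread_id. A sketch under those assumptions (tensor pointers and the sequential loop are illustrative; real callers would dispatch the workers from a thread pool):

#include <cstdint>
#include <vector>
#include "depthwise.hpp"

void run_depthwise(const arm_conv::depthwise::DepthwiseArgs &args,
                   const float *input, const float *weights_hwio,
                   const float *bias, float *output, unsigned int n_threads)
{
    auto dwc = arm_conv::depthwise::depthwise<float, float, float>(args);

    // One-off: rearrange weights and bias into the kernel's preferred layout.
    std::vector<uint8_t> packed(dwc->get_storage_size());
    dwc->pack_parameters(packed.data(), bias, weights_hwio);

    // Scratch sized for all threads; the implementation partitions it by id.
    std::vector<uint8_t> scratch(dwc->get_working_size(n_threads, args.input_channels));

    // Each worker covers its share of the output; shown sequentially here.
    for (unsigned int tid = 0; tid < n_threads; tid++)
    {
        dwc->execute(input, packed.data(), output, scratch.data(), tid, n_threads);
    }
}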
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index fdc18aef39..b6a0a0abed 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -24,9 +24,7 @@
 
 #pragma once
 
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif // CYCLE_PROFILING
+#include "common.hpp"
 
 namespace arm_conv
 {
@@ -55,11 +53,6 @@ struct PoolingStride
     unsigned int rows, cols;
 };
 
-struct PaddingValues
-{
-    unsigned int left, top, right, bottom;
-};
-
 class IPoolingCommon
 {
 public:
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
deleted file mode 100644
index 70d6689731..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include
-#include "activation.hpp"
-#include "padding.hpp"
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-class IDepthwiseConvolution
-{
-  public:
-    virtual ~IDepthwiseConvolution() = default;
-
-    virtual int output_size(
-      int dim_size,
-      unsigned int padding_before,
-      unsigned int padding_after
-    ) const = 0;
-
-    /* Set input tensor and stride.
*/ - virtual void set_input(const void *inptr) = 0; - virtual void set_input(const void *inptr, int column_stride) = 0; - virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0; - virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0; - - /* Set output tensor and stride. */ - virtual void set_output(void *outptr) = 0; - virtual void set_output(void *outptr, int column_stride) = 0; - virtual void set_output(void *outptr, int row_stride, int column_stride) = 0; - virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0; - - /* Weights and biases are re-ordered to improve memory access patterns. Use - * these methods to determine the size of the re-pack buffer and to set the - * address (and implicitly reorder the weights and biases into) the buffer. - */ - virtual size_t get_packed_params_size(void) const = 0; - virtual void set_packed_params_buffer(void *) = 0; - - virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0; - virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0; - virtual void pack_params( - void *buffer, - const void* weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const = 0; - - /* Working space is used to pad tensors on the fly. Before running any - * inference check the amount of space required, allocate and provide a - * pointer to the convolution engine. - */ - virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0; - virtual void set_working_space(void *) = 0; - - virtual unsigned int get_window(void) const = 0; - virtual void run( - unsigned int start, - unsigned int stop, - unsigned int threadid=0 - ) = 0; -}; - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename TIn, typename TBias, typename TOut, - typename Derived -> -class DepthwiseConvolutionBase : public IDepthwiseConvolution -{ - public: - // Information about the specific convolution instance - using InputType = TIn; - using BiasType = TBias; - using OutputType = TOut; - static constexpr int output_tile_rows = OutputTileRows; - static constexpr int output_tile_cols = OutputTileCols; - static constexpr int kernel_rows = KernelRows; - static constexpr int kernel_cols = KernelCols; - static constexpr int stride_rows = StrideRows; - static constexpr int stride_cols = StrideCols; - static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows; - static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols; - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. - * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - */ - DepthwiseConvolutionBase( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - /** Create a new depthwise convolution engine. - * - * @param[in] n_batches Number of batches tensors. - * @param[in] n_input_rows Number of rows in input tensor. 
- * @param[in] n_input_cols Number of columns in input tensor. - * @param[in] n_channels Number of channels in input and output tensors. - */ - DepthwiseConvolutionBase( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - // Cannot copy or move a DepthwiseConvolution. - DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete; - DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete; - - /* Set input tensor and stride. */ - void set_input(const void *inptr) override; - void set_input(const void *inptr, int column_stride) override; - void set_input(const void *inptr, int row_stride, int column_stride) override; - void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override; - - /* Set output tensor and stride. */ - void set_output(void *outptr) override; - void set_output(void *outptr, int column_stride) override; - void set_output(void *outptr, int row_stride, int column_stride) override; - void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override; - - /** Get the number of output rows/columns. - * - * @param[in] dim_size Number of elements in the dimension (rows/columns) - * @param[in] same_padding True if the padding is SAME, otherwise false. - */ - static int get_output_size( - int dim_size, unsigned int padding_before, unsigned int padding_after - ); - - int output_size( - int dim_size, unsigned int padding_before, unsigned int padding_after - ) const override; - - /* Determine how much memory is required to store the packed weights and - * biases. - */ - size_t get_packed_params_size(void) const override; - - /* Set the buffer for the packed weights and biases, and perform the - * packing. - */ - void set_packed_params_buffer(void *buffer) override; - - void pack_params(const void *weights, const void *biases=nullptr) const override; - - void pack_params( - void *buffer, - const void *weights, - const void *biases=nullptr - ) const override; - - void pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const override; - - /** Query the amount of working space required. - * @param[in] The largest number of threads which will be used to execute - * the kernel. - */ - size_t get_working_space_size(unsigned int n_threads=1) const override; - - /** Set the working space buffer. - */ - void set_working_space(void *buffer) override; - - /** Get the window of work to be performed by an instance of the operator. - */ - unsigned int get_window(void) const override; - - /** Perform a portion of the work associated with the operator. - * - * Will perform the window of work described by $[start, stop)$. - * - * @param[in] start Start of the window of work to perform. - * @param[in] stop End of the work to perform. - * @param[in] ID of the thread performing the work. - */ - void run( - unsigned int start, - unsigned int stop, - unsigned int threadid=0 - ) override; - - protected: - /** Get the value to use to pad the tensor. - */ - TIn _input_padding_value(void) const; - - /** Implementation of the parameter packing. 
- */ - void _pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const; - - /** Process a tile-row of the tensors. - */ - void process_tile_row( - unsigned int threadid, - int n_channels, - const void* packed_params, - const InputType* inptr, - OutputType* outptr, - int row_pad_in_top, - int row_pad_in_left, - int row_pad_in_bottom, - int row_pad_out_bottom, - int n_tiles, - int n_input_cols, - int n_output_cols - ); - - /** Process a single tile of the tensor. - * - * This method will apply input/output padding (if required) and call the - * depthwise tile implementation. - */ - void process_tile( - unsigned int threadid, - int n_channels, - const void* packed_params, - const InputType* inptr, - OutputType* outptr, - int pad_in_top, - int pad_in_left, - int pad_in_bottom, - int pad_in_right, - int pad_out_bottom, - int pad_out_right - ); - - /** Perform depthwise convolution on a single tile. - */ - template - void execute_tile( - int n_channels, - const void* packed_params, - const InputType* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - OutputType* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template - void execute_tile( - int n_channels, - const void* packed_params, - const InputType* inptrs[inner_tile_rows][inner_tile_cols], - OutputType* outptrs[output_tile_rows][output_tile_cols] - ); - - int n_channels(void) const; - - private: - // Member variables of instances of a convolution engine. - const InputType* _input; - OutputType* _output; - void* _packed_parameters; - void* _working_space; // Per-thread working space - const int _n_batches, _n_input_rows, _n_input_cols, _n_channels, - _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols; - const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right; - const nck::ActivationFunction _activation; - - // Stride information for a convolution instance - int _input_col_stride, _input_row_stride, _input_batch_stride; - int _output_col_stride, _output_row_stride, _output_batch_stride; - - // Methods for getting access to working space - size_t _get_input_working_space_size(void) const; - size_t _get_output_working_space_size(void) const; - - void *_get_input_working_space(unsigned int threadid) const; - void *_get_output_working_space(unsigned int threadid) const; -}; - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename TIn, typename TBias, typename TOut -> -class DepthwiseConvolution : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut - > -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut - > >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - using Base::DepthwiseConvolutionBase; - - protected: - template - void execute_tile( - int n_channels, - const void* packed_params, - const TIn* inptr, - unsigned int 
in_row_stride, - unsigned int in_col_stride, - TOut* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template - void execute_tile( - int n_channels, - const void* packed_params, - const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); -}; - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float -> : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float - > -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float, float, float - > >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - protected: - template - void execute_tile( - int n_channels, - const void* packed_params, - const float* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - float* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template - void execute_tile( - int n_channels, - const void* packed_params, - const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); -}; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t -> : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t - > -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t, - DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - float16_t, float16_t, float16_t - > >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - DepthwiseConvolution( 
- int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - protected: - template - void execute_tile( - int n_channels, - const void* packed_params, - const float16_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - float16_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template - void execute_tile( - int n_channels, - const void* packed_params, - const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); -}; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp deleted file mode 100644 index 864c6e24a0..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x26, %[inptr0], %[input_row_stride]\n" - "add x21, %[input_col_stride1], %[input_col_stride1]\n" - "add x23, %[outptr0], %[output_row_stride]\n" - "add x27, x26, %[input_row_stride]\n" - "add x22, x21, %[input_col_stride1]\n" - "and x24, %[n_channels], #3\n" - "add x28, x27, %[input_row_stride]\n" - "lsr x25, %[n_channels], #2\n" - "cbz x25, 4f\n" - "1:\n" - "ldr q15, [%[wbptr]]\n" - "subs x25, x25, #1\n" - "mov v3.16b, v15.16b\n" - "ldr q14, [%[wbptr], #16]\n" - "mov v1.16b, v15.16b\n" - "ldr q13, [%[wbptr], #32]\n" - "mov v2.16b, v15.16b\n" - "ldr q12, [%[wbptr], #48]\n" - "mov v0.16b, v15.16b\n" - "ldr q11, [%[wbptr], #64]\n" - "ldr q10, [%[wbptr], #80]\n" - "ldr q9, [%[wbptr], #96]\n" - "ldr q8, [%[wbptr], #112]\n" - "ldr q7, [%[wbptr], #128]\n" - "ldr q6, [%[wbptr], #144]\n" - "ldr q24, [%[inptr0]]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "ldr q22, [x26]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "ldr q19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr q18, [x27]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr q21, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "ldr q17, [%[inptr0], x21]\n" - "ldr q20, [x28]\n" - "ldr q5, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr q19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr q23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr q22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr q21, [x27, x21]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr q16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr q20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "ldr q15, [%[wbptr]]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "ldr q14, [%[wbptr], #16]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "ldr q11, [%[wbptr], #64]\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "ldr q24, [%[inptr0]]\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "ldr q19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "ldr q17, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "ldr q13, [%[wbptr], #32]\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "add x26, x26, #16\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "ldr q22, [x26]\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "ldr q8, [%[wbptr], #112]\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "ldr q21, [x26, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "add x27, x27, #16\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "ldr q10, [%[wbptr], #80]\n" - "str q1, [x23]\n" - "mov v3.16b, v15.16b\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "ldr q18, [x27]\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "ldr q12, 
[%[wbptr], #48]\n" - "mov v1.16b, v15.16b\n" - "ldr q5, [x27, %[input_col_stride1]]\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "ldr q7, [%[wbptr], #128]\n" - "mov v2.16b, v15.16b\n" - "add x28, x28, #16\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr q20, [x28]\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "ldr q9, [%[wbptr], #96]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "subs x25, x25, #1\n" - "str q0, [x23, %[output_col_stride1]]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr q6, [%[wbptr], #144]\n" - "add x23, x23, #16\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "mov v0.16b, v15.16b\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr q19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr q23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr q22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr q21, [x27, x21]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr q16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr q20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "add x26, x26, #16\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add x27, x27, #16\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "add x28, x28, #16\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "str q1, [x23]\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "str q0, [x23, %[output_col_stride1]]\n" - "add x23, x23, #16\n" - "4:\n" - "cbz x24, 7f\n" - "ldr s15, [%[wbptr]]\n" - "mov v3.16b, v15.16b\n" - "ldr s14, [%[wbptr], #4]\n" - "mov v1.16b, v15.16b\n" - "ldr s13, [%[wbptr], #8]\n" - "mov v2.16b, v15.16b\n" - "ldr s12, [%[wbptr], #12]\n" - "mov v0.16b, v15.16b\n" - "ldr s11, [%[wbptr], #16]\n" - "ldr s10, [%[wbptr], #20]\n" - "subs x24, x24, #1\n" - "ldr s9, [%[wbptr], #24]\n" - "ldr s8, [%[wbptr], #28]\n" - "ldr s7, [%[wbptr], #32]\n" - "ldr s6, [%[wbptr], #36]\n" - "ldr s24, [%[inptr0]]\n" - "ldr s22, [x26]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "ldr s18, [x27]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr s21, [x26, %[input_col_stride1]]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr s17, [%[inptr0], x21]\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "ldr s20, [x28]\n" - "ldr s5, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr s19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr s23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr s22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr s21, [x27, x21]\n" - "fmla 
v3.4s, v17.4s, v12.4s\n" - "ldr s18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr s16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr s20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "ldr s15, [%[wbptr]]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "ldr s14, [%[wbptr], #4]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "ldr s11, [%[wbptr], #16]\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "ldr s24, [%[inptr0]]\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "ldr s17, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "ldr s13, [%[wbptr], #8]\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "add x26, x26, #4\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "ldr s22, [x26]\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "ldr s8, [%[wbptr], #28]\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "ldr s21, [x26, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "add x27, x27, #4\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "ldr s10, [%[wbptr], #20]\n" - "str s1, [x23]\n" - "mov v3.16b, v15.16b\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "ldr s18, [x27]\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "ldr s12, [%[wbptr], #12]\n" - "mov v1.16b, v15.16b\n" - "ldr s5, [x27, %[input_col_stride1]]\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v24.4s, v14.4s\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "ldr s7, [%[wbptr], #32]\n" - "mov v2.16b, v15.16b\n" - "add x28, x28, #4\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "ldr s20, [x28]\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "ldr s9, [%[wbptr], #24]\n" - "fmla v1.4s, v22.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v3.4s, v19.4s, v13.4s\n" - "subs x24, x24, #1\n" - "str s0, [x23, %[output_col_stride1]]\n" - "fmla v2.4s, v19.4s, v14.4s\n" - "ldr s6, [%[wbptr], #36]\n" - "add x23, x23, #4\n" - "fmla v3.4s, v18.4s, v8.4s\n" - "fmla v1.4s, v18.4s, v11.4s\n" - "mov v0.16b, v15.16b\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v21.4s, v10.4s\n" - "ldr s19, [x26, x21]\n" - "fmla v1.4s, v21.4s, v13.4s\n" - "ldr s23, [%[inptr0], x22]\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "ldr s22, [x28, %[input_col_stride1]]\n" - "fmla v0.4s, v21.4s, v14.4s\n" - "ldr s21, [x27, x21]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr s18, [x26, x22]\n" - "fmla v2.4s, v17.4s, v13.4s\n" - "ldr s16, [x28, x21]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "ldr s20, [x27, x22]\n" - "fmla v3.4s, v5.4s, v7.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v2.4s, v5.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v5.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v5.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "add x26, x26, #4\n" - "fmla v1.4s, v19.4s, v12.4s\n" - "add x27, x27, #4\n" - "fmla v2.4s, v19.4s, v10.4s\n" - "add x28, x28, #4\n" - "fmla v0.4s, v19.4s, v13.4s\n" - "fmla v3.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v22.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v12.4s\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v21.4s, v10.4s\n" - "fmla v1.4s, v16.4s, v6.4s\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v0.4s, v18.4s, v12.4s\n" - "str s1, [x23]\n" - "fmla v2.4s, v20.4s, v6.4s\n" - "fmla v0.4s, v16.4s, v7.4s\n" - "str s2, 
[%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v20.4s, v9.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v0.4s, v4.4s, v6.4s\n" - "str s0, [x23, %[output_col_stride1]]\n" - "add x23, x23, #4\n" - "7:\n" - : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x21, %[inptr0], %[input_row_stride]\n" - "add x24, %[input_col_stride1], %[input_col_stride1]\n" - "add x22, %[outptr0], %[output_row_stride]\n" - "add x23, x21, %[input_row_stride]\n" - "add x27, x24, %[input_col_stride1]\n" - "and x25, %[n_channels], #3\n" - "add x28, x23, %[input_row_stride]\n" - "lsr x26, %[n_channels], #2\n" - "cbz x26, 4f\n" - "1:\n" - "ldr q11, [%[wbptr]]\n" - "subs x26, x26, #1\n" - "mov v17.16b, v11.16b\n" - "ldr q13, [%[wbptr], #16]\n" - "mov v15.16b, v11.16b\n" - "ldr q4, [%[wbptr], #32]\n" - "mov v16.16b, v11.16b\n" - "ldr q2, [%[wbptr], #48]\n" - "mov v14.16b, v11.16b\n" - "ldr q5, [%[wbptr], #64]\n" - "ldr q10, [%[wbptr], #80]\n" - "ldr q1, [%[wbptr], #96]\n" - "ldr q12, [%[wbptr], #112]\n" - "ldr q0, [%[wbptr], #128]\n" - "ldr q3, [%[wbptr], #144]\n" - "ldr q6, [%[inptr0]]\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "ldr q27, [x21]\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr q24, [x23]\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "ldr q22, [x21, %[input_col_stride1]]\n" - "ldr q9, [%[inptr0], x24]\n" - "ldr q8, [x28]\n" - "ldr q20, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "beq 3f\n" - "2:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr q26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr q25, [x28, %[input_col_stride1]]\n" - "fmla v17.4s, v22.4s, v10.4s\n" - "ldr q24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr q21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr q7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr q19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr q18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "ldr q11, [%[wbptr]]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "ldr q13, [%[wbptr], #16]\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "ldr q6, [%[inptr0]]\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "ldr q5, [%[wbptr], #64]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr q9, [%[inptr0], x24]\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "add x21, x21, #16\n" - 
"fmla v16.4s, v27.4s, v2.4s\n" - "ldr q27, [x21]\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "ldr q4, [%[wbptr], #32]\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "ldr q22, [x21, %[input_col_stride1]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "add x23, x23, #16\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "ldr q12, [%[wbptr], #112]\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "ldr q24, [x23]\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "ldr q20, [x23, %[input_col_stride1]]\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "add x28, x28, #16\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "ldr q10, [%[wbptr], #80]\n" - "movi v26.16b, #0\n" - "ldr q8, [x28]\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "subs x26, x26, #1\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "ldr q2, [%[wbptr], #48]\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str q17, [%[outptr0]]\n" - "str q16, [%[outptr0], %[output_col_stride1]]\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str q15, [x22]\n" - "mov v17.16b, v11.16b\n" - "mov v15.16b, v11.16b\n" - "ldr q0, [%[wbptr], #128]\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "ldr q1, [%[wbptr], #96]\n" - "mov v16.16b, v11.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "ldr q3, [%[wbptr], #144]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "str q14, [x22, %[output_col_stride1]]\n" - "mov v14.16b, v11.16b\n" - "add x22, x22, #16\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "bne 2b\n" - "3:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr q26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr q25, [x28, %[input_col_stride1]]\n" - "fmla v17.4s, v22.4s, v10.4s\n" - "ldr q24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr q21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr q7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr q19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr q18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "add x21, x21, #16\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add x23, x23, #16\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "add x28, x28, #16\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "movi v26.16b, #0\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "fmla v16.4s, v27.4s, v2.4s\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "str q17, [%[outptr0]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "str q15, [x22]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str q16, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "str q14, [x22, %[output_col_stride1]]\n" - "add x22, x22, #16\n" - "4:\n" - "cbz x25, 7f\n" - "ldr s11, [%[wbptr]]\n" - "mov v17.16b, v11.16b\n" - "ldr s13, [%[wbptr], #4]\n" - "mov v15.16b, v11.16b\n" - "ldr s4, [%[wbptr], #8]\n" - "mov v16.16b, v11.16b\n" - "ldr s2, [%[wbptr], #12]\n" - 
"mov v14.16b, v11.16b\n" - "ldr s5, [%[wbptr], #16]\n" - "ldr s10, [%[wbptr], #20]\n" - "subs x25, x25, #1\n" - "ldr s1, [%[wbptr], #24]\n" - "ldr s12, [%[wbptr], #28]\n" - "ldr s0, [%[wbptr], #32]\n" - "ldr s3, [%[wbptr], #36]\n" - "ldr s6, [%[inptr0]]\n" - "ldr s27, [x21]\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "ldr s24, [x23]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr s22, [x21, %[input_col_stride1]]\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "ldr s9, [%[inptr0], x24]\n" - "ldr s8, [x28]\n" - "ldr s20, [x23, %[input_col_stride1]]\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "beq 6f\n" - "5:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr s26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr s25, [x28, %[input_col_stride1]]\n" - "fmla v17.4s, v22.4s, v10.4s\n" - "ldr s24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr s21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr s7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr s19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr s18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "ldr s11, [%[wbptr]]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "ldr s13, [%[wbptr], #4]\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "ldr s6, [%[inptr0]]\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "ldr s5, [%[wbptr], #16]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr s9, [%[inptr0], x24]\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "add x21, x21, #4\n" - "fmla v16.4s, v27.4s, v2.4s\n" - "ldr s27, [x21]\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "ldr s4, [%[wbptr], #8]\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "ldr s22, [x21, %[input_col_stride1]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "add x23, x23, #4\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "ldr s12, [%[wbptr], #28]\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "ldr s24, [x23]\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "ldr s20, [x23, %[input_col_stride1]]\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "add x28, x28, #4\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "ldr s10, [%[wbptr], #20]\n" - "movi v26.16b, #0\n" - "ldr s8, [x28]\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "subs x25, x25, #1\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "ldr s2, [%[wbptr], #12]\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str s17, [%[outptr0]]\n" - "str s16, [%[outptr0], %[output_col_stride1]]\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str s15, [x22]\n" - "mov v17.16b, v11.16b\n" - "mov v15.16b, v11.16b\n" - "ldr s0, [%[wbptr], #32]\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "ldr s1, [%[wbptr], #24]\n" - "mov v16.16b, v11.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v17.4s, v6.4s, v13.4s\n" - "fmla v15.4s, v27.4s, v13.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "ldr s3, [%[wbptr], #36]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "str s14, [x22, %[output_col_stride1]]\n" - "mov v14.16b, v11.16b\n" - "add x22, x22, #4\n" - "fmla v17.4s, v27.4s, v5.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "bne 5b\n" - "6:\n" - "fmla v17.4s, v24.4s, v12.4s\n" - "ldr s26, [x21, x24]\n" - "fmla v15.4s, v24.4s, v5.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v16.4s, v22.4s, v5.4s\n" - "ldr s25, [x28, %[input_col_stride1]]\n" - "fmla 
v17.4s, v22.4s, v10.4s\n" - "ldr s24, [x23, x24]\n" - "fmla v15.4s, v22.4s, v4.4s\n" - "ldr s21, [x21, x27]\n" - "fmla v14.4s, v22.4s, v13.4s\n" - "ldr s7, [x28, x24]\n" - "fmla v17.4s, v9.4s, v2.4s\n" - "ldr s19, [x23, x27]\n" - "fmla v16.4s, v9.4s, v4.4s\n" - "ldr s18, [x28, x27]\n" - "fmla v15.4s, v8.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v20.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v20.4s, v12.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v15.4s, v20.4s, v10.4s\n" - "add x21, x21, #4\n" - "fmla v14.4s, v20.4s, v5.4s\n" - "add x23, x23, #4\n" - "fmla v17.4s, v26.4s, v1.4s\n" - "add x28, x28, #4\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "movi v26.16b, #0\n" - "fmla v17.4s, v24.4s, v3.4s\n" - "fmla v16.4s, v27.4s, v2.4s\n" - "fmla v15.4s, v25.4s, v0.4s\n" - "fmla v14.4s, v25.4s, v12.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v16.4s, v24.4s, v0.4s\n" - "str s17, [%[outptr0]]\n" - "fmla v15.4s, v24.4s, v1.4s\n" - "fmla v14.4s, v24.4s, v10.4s\n" - "fmla v16.4s, v21.4s, v1.4s\n" - "fmla v15.4s, v7.4s, v3.4s\n" - "fmla v14.4s, v21.4s, v2.4s\n" - "fmla v16.4s, v19.4s, v3.4s\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "fmla v14.4s, v7.4s, v0.4s\n" - "str s15, [x22]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "fmla v14.4s, v19.4s, v1.4s\n" - "str s16, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v14.4s, v18.4s, v3.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "str s14, [x22, %[output_col_stride1]]\n" - "add x22, x22, #4\n" - "7:\n" - : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x21, %[inptr0], %[input_row_stride]\n" - "add x23, %[input_col_stride1], %[input_col_stride1]\n" - "add x24, %[outptr0], %[output_row_stride]\n" - "add x27, x21, %[input_row_stride]\n" - "add x22, x23, %[input_col_stride1]\n" - "and x25, %[n_channels], #3\n" - "add x28, x27, %[input_row_stride]\n" - "lsr x26, %[n_channels], #2\n" - "cbz x26, 4f\n" - "1:\n" - "ldr q19, [%[wbptr]]\n" - "subs x26, x26, #1\n" - "mov v3.16b, v19.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v1.16b, v19.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v2.16b, v19.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "mov v0.16b, v19.16b\n" - "ldr q13, [%[wbptr], #64]\n" - "ldr q23, [%[wbptr], #80]\n" - "ldr q15, [%[wbptr], #96]\n" - "ldr q20, [%[wbptr], #112]\n" - "ldr q21, [%[wbptr], #128]\n" - "ldr q14, [%[wbptr], #144]\n" - "ldr q16, [%[inptr0]]\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "ldr q28, [x21]\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "ldr q22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v22.4s, v12.4s\n" 
- "ldr q24, [x27]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr q8, [x21, %[input_col_stride1]]\n" - "ldr q9, [%[inptr0], x23]\n" - "ldr q18, [x28]\n" - "ldr q6, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr q25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr q28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr q24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr q27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr q7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr q17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr q5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v1.4s, v18.4s, v20.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "ldr q19, [%[wbptr]]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "ldr q16, [%[inptr0]]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "ldr q22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "ldr q13, [%[wbptr], #64]\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "ldr q9, [%[inptr0], x23]\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "add x21, x21, #16\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "ldr q28, [x21]\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "ldr q8, [x21, %[input_col_stride1]]\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "add x27, x27, #16\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "ldr q20, [%[wbptr], #112]\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "ldr q24, [x27]\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "ldr q6, [x27, %[input_col_stride1]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "add x28, x28, #16\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "ldr q23, [%[wbptr], #80]\n" - "movi v25.16b, #0\n" - "ldr q18, [x28]\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "subs x26, x26, #1\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "ldr q10, [%[wbptr], #48]\n" - "fmov v26.4s, #6.0\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "ldr q21, [%[wbptr], #128]\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "str q3, [%[outptr0]]\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "str q1, [x24]\n" - "mov v3.16b, v19.16b\n" - "mov v1.16b, v19.16b\n" - "ldr q15, [%[wbptr], #96]\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "ldr q14, [%[wbptr], #144]\n" - "mov v2.16b, v19.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "str q0, [x24, %[output_col_stride1]]\n" - "mov v0.16b, v19.16b\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "add x24, x24, #16\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr q25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr q28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr q24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr q27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr q7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr q17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr q5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr q4, [x28, x22]\n" - "fmla v1.4s, v18.4s, 
v20.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "add x21, x21, #16\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add x27, x27, #16\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "add x28, x28, #16\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "movi v25.16b, #0\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmov v26.4s, #6.0\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "str q3, [%[outptr0]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "str q1, [x24]\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "str q0, [x24, %[output_col_stride1]]\n" - "add x24, x24, #16\n" - "4:\n" - "cbz x25, 7f\n" - "ldr s19, [%[wbptr]]\n" - "mov v3.16b, v19.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v1.16b, v19.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v2.16b, v19.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "mov v0.16b, v19.16b\n" - "ldr s13, [%[wbptr], #16]\n" - "ldr s23, [%[wbptr], #20]\n" - "subs x25, x25, #1\n" - "ldr s15, [%[wbptr], #24]\n" - "ldr s20, [%[wbptr], #28]\n" - "ldr s21, [%[wbptr], #32]\n" - "ldr s14, [%[wbptr], #36]\n" - "ldr s16, [%[inptr0]]\n" - "ldr s28, [x21]\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "ldr s22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "ldr s24, [x27]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr s8, [x21, %[input_col_stride1]]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr s9, [%[inptr0], x23]\n" - "ldr s18, [x28]\n" - "ldr s6, [x27, %[input_col_stride1]]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr s25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr s28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr s24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr s27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr s7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr s17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr s5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v1.4s, v18.4s, v20.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "ldr s19, [%[wbptr]]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "ldr s16, [%[inptr0]]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "ldr s22, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "ldr s13, [%[wbptr], #16]\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "ldr s9, [%[inptr0], x23]\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "add x21, x21, #4\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "ldr s28, [x21]\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "ldr s11, 
[%[wbptr], #8]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "ldr s8, [x21, %[input_col_stride1]]\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "add x27, x27, #4\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "ldr s20, [%[wbptr], #28]\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "ldr s24, [x27]\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "ldr s6, [x27, %[input_col_stride1]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "add x28, x28, #4\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "ldr s23, [%[wbptr], #20]\n" - "movi v25.16b, #0\n" - "ldr s18, [x28]\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "subs x25, x25, #1\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "ldr s10, [%[wbptr], #12]\n" - "fmov v26.4s, #6.0\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "ldr s21, [%[wbptr], #32]\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "str s3, [%[outptr0]]\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "str s1, [x24]\n" - "mov v3.16b, v19.16b\n" - "mov v1.16b, v19.16b\n" - "ldr s15, [%[wbptr], #24]\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "ldr s14, [%[wbptr], #36]\n" - "mov v2.16b, v19.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "fmla v3.4s, v16.4s, v12.4s\n" - "fmla v1.4s, v28.4s, v12.4s\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "str s0, [x24, %[output_col_stride1]]\n" - "mov v0.16b, v19.16b\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "add x24, x24, #4\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v24.4s, v20.4s\n" - "ldr s25, [x21, x23]\n" - "fmla v1.4s, v24.4s, v13.4s\n" - "ldr s28, [%[inptr0], x22]\n" - "fmla v2.4s, v8.4s, v13.4s\n" - "ldr s24, [x28, %[input_col_stride1]]\n" - "fmla v3.4s, v8.4s, v23.4s\n" - "ldr s27, [x27, x23]\n" - "fmla v1.4s, v8.4s, v11.4s\n" - "ldr s7, [x21, x22]\n" - "fmla v0.4s, v8.4s, v12.4s\n" - "ldr s17, [x28, x23]\n" - "fmla v3.4s, v9.4s, v10.4s\n" - "ldr s5, [x27, x22]\n" - "fmla v2.4s, v9.4s, v11.4s\n" - "ldr s4, [x28, x22]\n" - "fmla v1.4s, v18.4s, v20.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v3.4s, v6.4s, v21.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v2.4s, v6.4s, v20.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v6.4s, v23.4s\n" - "add x21, x21, #4\n" - "fmla v0.4s, v6.4s, v13.4s\n" - "add x27, x27, #4\n" - "fmla v3.4s, v25.4s, v15.4s\n" - "add x28, x28, #4\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v23.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "movi v25.16b, #0\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmov v26.4s, #6.0\n" - "fmla v2.4s, v28.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v21.4s\n" - "fmla v0.4s, v24.4s, v20.4s\n" - "fmax v3.4s, v3.4s, v25.4s\n" - "fmla v1.4s, v27.4s, v15.4s\n" - "fmla v2.4s, v27.4s, v21.4s\n" - "fmla v0.4s, v27.4s, v23.4s\n" - "fmin v3.4s, v3.4s, v26.4s\n" - "str s3, [%[outptr0]]\n" - "fmla v2.4s, v7.4s, v15.4s\n" - "fmla v0.4s, v7.4s, v10.4s\n" - "fmla v1.4s, v17.4s, v14.4s\n" - "fmla v2.4s, v5.4s, v14.4s\n" - "fmla v0.4s, v17.4s, v21.4s\n" - "fmax v1.4s, v1.4s, v25.4s\n" - "fmax v2.4s, v2.4s, v25.4s\n" - "fmla v0.4s, v5.4s, v15.4s\n" - "fmin v1.4s, v1.4s, v26.4s\n" - "fmin v2.4s, v2.4s, v26.4s\n" - "str s1, [x24]\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmla v0.4s, v4.4s, v14.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmax v0.4s, v0.4s, v25.4s\n" - "fmin v0.4s, v0.4s, v26.4s\n" - "str s0, [x24, %[output_col_stride1]]\n" - "add x24, x24, #4\n" - "7:\n" - : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" 
(weight_bias_ptr)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
-#endif // __aarch64__
-
-template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;
-
-} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
deleted file mode 100644
index 2554436172..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ /dev/null
@@ -1,2809 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x23, %[inptr0], %[input_row_stride]\n"
-    "add x19, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x22, %[outptr0], %[output_row_stride]\n"
-    "add x24, x23, %[input_row_stride]\n"
-    "add x20, x19, %[input_col_stride1]\n"
-    "and x27, %[n_channels], #3\n"
-    "add x25, x24, %[input_row_stride]\n"
-    "add x21, x20, %[input_col_stride1]\n"
-    "lsr x28, %[n_channels], #2\n"
-    "add x26, x25, %[input_row_stride]\n"
-    "cbz x28, 4f\n"
-    "1:\n"
-    "ldr q14, [%[wbptr]]\n"
-    "subs x28, x28, #1\n"
-    "mov v12.16b, v14.16b\n"
-    "ldr q8, [%[wbptr], #16]\n"
-    "mov v10.16b, v14.16b\n"
-    "ldr q7, [%[wbptr], #32]\n"
-    "mov v11.16b, v14.16b\n"
-    "ldr q6, [%[wbptr], #48]\n"
-    "mov v9.16b, v14.16b\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "ldr q15, [%[inptr0]]\n"
-    "fmla v12.4s, v15.4s, v8.4s\n"
-    "ldr q20, [x23]\n"
-    "ldr q13, [%[inptr0], %[input_col_stride1]]\n"
-    "ldr q17, [x24]\n"
-    "fmla v10.4s, v17.4s, v8.4s\n"
-    "ldr q16, [x23, %[input_col_stride1]]\n"
-    "fmla v12.4s, v20.4s, v5.4s\n"
-    "ldr q18, [%[inptr0], x19]\n"
-    "ldr q14, [x25]\n"
-    "ldr q15, [x24, %[input_col_stride1]]\n"
-    "fmla v12.4s, v13.4s, v7.4s\n"
-    "fmla v12.4s, v17.4s, v2.4s\n"
-    "fmla v12.4s, v16.4s, v4.4s\n"
-    "fmla v12.4s, v18.4s, v6.4s\n"
-    "beq 3f\n"
-    "2:\n"
-    "fmla v11.4s, v18.4s, v8.4s\n"
-    "ldr q19, [x23, x19]\n"
-    "fmla v10.4s, v14.4s, v5.4s\n"
-    "ldr q20, [%[inptr0], x20]\n"
-    "fmla v12.4s, v15.4s, v1.4s\n"
-    "ldr q14, [x26]\n"
-    "fmla v11.4s, v19.4s, v5.4s\n"
-    "ldr q13, [x25, %[input_col_stride1]]\n"
-    "fmla v10.4s, v15.4s, v7.4s\n"
-    "ldr q17, [x24, x19]\n"
-    "fmla v12.4s, v19.4s, v3.4s\n"
-    "ldr q19, [x23, x20]\n"
-    "fmla v11.4s, v20.4s, v7.4s\n"
-    "ldr q18, [%[inptr0], x21]\n"
-    "fmla v10.4s, v14.4s, v2.4s\n"
-    "ldr q16, [x26, %[input_col_stride1]]\n"
-    "fmla v12.4s, v17.4s, v0.4s\n"
-    "ldr q14, [x25, x19]\n"
-    "fmla v11.4s, v17.4s, v2.4s\n"
-    "ldr q15, [x24, x20]\n"
-    "fmla v10.4s, v13.4s, v4.4s\n"
-    "ldr q13, [x23, x21]\n"
-    "str q12, [%[outptr0]]\n"
-    "fmla v9.4s, v17.4s, v8.4s\n"
-    "fmla v11.4s, v19.4s, v4.4s\n"
-    "ldr q12, [x26, x19]\n"
-    "fmla v10.4s, v17.4s, v6.4s\n"
-    "ldr q20, [x25, x20]\n"
-    "fmla v9.4s, v14.4s, v5.4s\n"
-    "ldr q17, [x24, x21]\n"
-    "fmla v11.4s, v18.4s, v6.4s\n"
-    "ldr q19, [x26, x20]\n"
-    "fmla v10.4s, v16.4s, v1.4s\n"
-    "ldr q18, [x25, x21]\n"
-    "fmla v9.4s, v15.4s, v7.4s\n"
-    "ldr q16, [x26, x21]\n"
-    "fmla v11.4s, v15.4s, v1.4s\n"
-    "add %[wbptr], %[wbptr], #160\n"
-    "fmla v10.4s, v14.4s, v3.4s\n"
-    "ldr q14, [%[wbptr]]\n"
-    "fmla v9.4s, v12.4s, v2.4s\n"
-    "ldr q8, [%[wbptr], #16]\n"
-    "fmla v11.4s, v13.4s, v3.4s\n"
-    "ldr q7, [%[wbptr], #32]\n"
-    "fmla v10.4s, v12.4s, v0.4s\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "fmla v9.4s, v20.4s, v4.4s\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "fmla v11.4s, v17.4s, v0.4s\n"
-    "prfm pldl1keep, [%[wbptr], #64]\n"
-    "str q10, [x22]\n"
-    "mov v12.16b, v14.16b\n"
-    "fmla v9.4s, v17.4s, v6.4s\n"
-    "ldr
q4, [%[wbptr], #80]\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "mov v10.16b, v14.16b\n" - "mov v11.16b, v14.16b\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr q6, [%[wbptr], #48]\n" - "ldr q15, [%[inptr0]]\n" - "add x23, x23, #16\n" - "fmla v12.4s, v15.4s, v8.4s\n" - "ldr q20, [x23]\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "ldr q1, [%[wbptr], #128]\n" - "ldr q13, [%[inptr0], %[input_col_stride1]]\n" - "add x24, x24, #16\n" - "fmla v12.4s, v20.4s, v5.4s\n" - "ldr q17, [x24]\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "ldr q3, [%[wbptr], #96]\n" - "fmla v10.4s, v17.4s, v8.4s\n" - "ldr q16, [x23, %[input_col_stride1]]\n" - "fmla v12.4s, v13.4s, v7.4s\n" - "ldr q18, [%[inptr0], x19]\n" - "str q9, [x22, %[output_col_stride1]]\n" - "add x25, x25, #16\n" - "mov v9.16b, v14.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "fmla v12.4s, v17.4s, v2.4s\n" - "ldr q14, [x25]\n" - "ldr q15, [x24, %[input_col_stride1]]\n" - "add x26, x26, #16\n" - "add %[outptr0], %[outptr0], #16\n" - "add x22, x22, #16\n" - "subs x28, x28, #1\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "bne 2b\n" - "3:\n" - "fmla v11.4s, v18.4s, v8.4s\n" - "ldr q19, [x23, x19]\n" - "fmla v10.4s, v14.4s, v5.4s\n" - "ldr q20, [%[inptr0], x20]\n" - "fmla v12.4s, v15.4s, v1.4s\n" - "ldr q14, [x26]\n" - "fmla v11.4s, v19.4s, v5.4s\n" - "ldr q13, [x25, %[input_col_stride1]]\n" - "fmla v10.4s, v15.4s, v7.4s\n" - "ldr q17, [x24, x19]\n" - "fmla v12.4s, v19.4s, v3.4s\n" - "ldr q19, [x23, x20]\n" - "fmla v11.4s, v20.4s, v7.4s\n" - "ldr q18, [%[inptr0], x21]\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "ldr q16, [x26, %[input_col_stride1]]\n" - "fmla v12.4s, v17.4s, v0.4s\n" - "ldr q14, [x25, x19]\n" - "fmla v11.4s, v17.4s, v2.4s\n" - "ldr q15, [x24, x20]\n" - "fmla v10.4s, v13.4s, v4.4s\n" - "ldr q13, [x23, x21]\n" - "str q12, [%[outptr0]]\n" - "fmla v9.4s, v17.4s, v8.4s\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr q12, [x26, x19]\n" - "fmla v10.4s, v17.4s, v6.4s\n" - "ldr q20, [x25, x20]\n" - "fmla v9.4s, v14.4s, v5.4s\n" - "ldr q17, [x24, x21]\n" - "fmla v11.4s, v18.4s, v6.4s\n" - "ldr q19, [x26, x20]\n" - "fmla v10.4s, v16.4s, v1.4s\n" - "ldr q18, [x25, x21]\n" - "fmla v9.4s, v15.4s, v7.4s\n" - "ldr q16, [x26, x21]\n" - "fmla v11.4s, v15.4s, v1.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v12.4s, v2.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v11.4s, v13.4s, v3.4s\n" - "add x23, x23, #16\n" - "fmla v10.4s, v12.4s, v0.4s\n" - "add x24, x24, #16\n" - "fmla v9.4s, v20.4s, v4.4s\n" - "add x25, x25, #16\n" - "fmla v11.4s, v17.4s, v0.4s\n" - "add x26, x26, #16\n" - "str q10, [x22]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "str q9, [x22, %[output_col_stride1]]\n" - "add x22, x22, #16\n" - "4:\n" - "cbz x27, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v12.16b, v14.16b\n" - "ldr s8, [%[wbptr], #4]\n" - "mov v10.16b, v14.16b\n" - "ldr s7, [%[wbptr], #8]\n" - "mov v11.16b, v14.16b\n" - "ldr s6, [%[wbptr], #12]\n" - "mov v9.16b, v14.16b\n" - "ldr s5, [%[wbptr], #16]\n" - "ldr s4, [%[wbptr], #20]\n" - "subs x27, x27, #1\n" - "ldr s3, [%[wbptr], #24]\n" - "ldr s2, [%[wbptr], #28]\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr s0, [%[wbptr], #36]\n" - "ldr s15, [%[inptr0]]\n" - "ldr s20, [x23]\n" - "fmla v12.4s, v15.4s, v8.4s\n" - "ldr s13, [%[inptr0], 
%[input_col_stride1]]\n" - "ldr s17, [x24]\n" - "ldr s16, [x23, %[input_col_stride1]]\n" - "fmla v10.4s, v17.4s, v8.4s\n" - "ldr s18, [%[inptr0], x19]\n" - "fmla v12.4s, v20.4s, v5.4s\n" - "ldr s14, [x25]\n" - "ldr s15, [x24, %[input_col_stride1]]\n" - "fmla v12.4s, v13.4s, v7.4s\n" - "fmla v12.4s, v17.4s, v2.4s\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "beq 6f\n" - "5:\n" - "fmla v11.4s, v18.4s, v8.4s\n" - "ldr s19, [x23, x19]\n" - "fmla v10.4s, v14.4s, v5.4s\n" - "ldr s20, [%[inptr0], x20]\n" - "fmla v12.4s, v15.4s, v1.4s\n" - "ldr s14, [x26]\n" - "fmla v11.4s, v19.4s, v5.4s\n" - "ldr s13, [x25, %[input_col_stride1]]\n" - "fmla v10.4s, v15.4s, v7.4s\n" - "ldr s17, [x24, x19]\n" - "fmla v12.4s, v19.4s, v3.4s\n" - "ldr s19, [x23, x20]\n" - "fmla v11.4s, v20.4s, v7.4s\n" - "ldr s18, [%[inptr0], x21]\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "ldr s16, [x26, %[input_col_stride1]]\n" - "fmla v12.4s, v17.4s, v0.4s\n" - "ldr s14, [x25, x19]\n" - "fmla v11.4s, v17.4s, v2.4s\n" - "ldr s15, [x24, x20]\n" - "fmla v10.4s, v13.4s, v4.4s\n" - "ldr s13, [x23, x21]\n" - "str s12, [%[outptr0]]\n" - "fmla v9.4s, v17.4s, v8.4s\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s12, [x26, x19]\n" - "fmla v10.4s, v17.4s, v6.4s\n" - "ldr s20, [x25, x20]\n" - "fmla v9.4s, v14.4s, v5.4s\n" - "ldr s17, [x24, x21]\n" - "fmla v11.4s, v18.4s, v6.4s\n" - "ldr s19, [x26, x20]\n" - "fmla v10.4s, v16.4s, v1.4s\n" - "ldr s18, [x25, x21]\n" - "fmla v9.4s, v15.4s, v7.4s\n" - "ldr s16, [x26, x21]\n" - "fmla v11.4s, v15.4s, v1.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr s14, [%[wbptr]]\n" - "fmla v9.4s, v12.4s, v2.4s\n" - "ldr s8, [%[wbptr], #4]\n" - "fmla v11.4s, v13.4s, v3.4s\n" - "ldr s7, [%[wbptr], #8]\n" - "fmla v10.4s, v12.4s, v0.4s\n" - "ldr s5, [%[wbptr], #16]\n" - "fmla v9.4s, v20.4s, v4.4s\n" - "ldr s2, [%[wbptr], #28]\n" - "fmla v11.4s, v17.4s, v0.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "str s10, [x22]\n" - "mov v12.16b, v14.16b\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr s4, [%[wbptr], #20]\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "mov v10.16b, v14.16b\n" - "mov v11.16b, v14.16b\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr s6, [%[wbptr], #12]\n" - "ldr s15, [%[inptr0]]\n" - "add x23, x23, #4\n" - "fmla v12.4s, v15.4s, v8.4s\n" - "ldr s20, [x23]\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr s13, [%[inptr0], %[input_col_stride1]]\n" - "add x24, x24, #4\n" - "fmla v12.4s, v20.4s, v5.4s\n" - "ldr s17, [x24]\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "ldr s3, [%[wbptr], #24]\n" - "fmla v10.4s, v17.4s, v8.4s\n" - "ldr s16, [x23, %[input_col_stride1]]\n" - "fmla v12.4s, v13.4s, v7.4s\n" - "ldr s18, [%[inptr0], x19]\n" - "str s9, [x22, %[output_col_stride1]]\n" - "add x25, x25, #4\n" - "mov v9.16b, v14.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "fmla v12.4s, v17.4s, v2.4s\n" - "ldr s14, [x25]\n" - "ldr s15, [x24, %[input_col_stride1]]\n" - "add x26, x26, #4\n" - "add %[outptr0], %[outptr0], #4\n" - "add x22, x22, #4\n" - "subs x27, x27, #1\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "bne 5b\n" - "6:\n" - "fmla v11.4s, v18.4s, v8.4s\n" - "ldr s19, [x23, x19]\n" - "fmla v10.4s, v14.4s, v5.4s\n" - "ldr s20, [%[inptr0], x20]\n" - "fmla v12.4s, v15.4s, v1.4s\n" - "ldr s14, [x26]\n" - "fmla v11.4s, v19.4s, v5.4s\n" - "ldr s13, [x25, %[input_col_stride1]]\n" - "fmla v10.4s, v15.4s, v7.4s\n" - "ldr s17, [x24, x19]\n" - "fmla v12.4s, v19.4s, v3.4s\n" - "ldr s19, [x23, x20]\n" - "fmla 
v11.4s, v20.4s, v7.4s\n" - "ldr s18, [%[inptr0], x21]\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "ldr s16, [x26, %[input_col_stride1]]\n" - "fmla v12.4s, v17.4s, v0.4s\n" - "ldr s14, [x25, x19]\n" - "fmla v11.4s, v17.4s, v2.4s\n" - "ldr s15, [x24, x20]\n" - "fmla v10.4s, v13.4s, v4.4s\n" - "ldr s13, [x23, x21]\n" - "str s12, [%[outptr0]]\n" - "fmla v9.4s, v17.4s, v8.4s\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s12, [x26, x19]\n" - "fmla v10.4s, v17.4s, v6.4s\n" - "ldr s20, [x25, x20]\n" - "fmla v9.4s, v14.4s, v5.4s\n" - "ldr s17, [x24, x21]\n" - "fmla v11.4s, v18.4s, v6.4s\n" - "ldr s19, [x26, x20]\n" - "fmla v10.4s, v16.4s, v1.4s\n" - "ldr s18, [x25, x21]\n" - "fmla v9.4s, v15.4s, v7.4s\n" - "ldr s16, [x26, x21]\n" - "fmla v11.4s, v15.4s, v1.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v9.4s, v12.4s, v2.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v11.4s, v13.4s, v3.4s\n" - "add x23, x23, #4\n" - "fmla v10.4s, v12.4s, v0.4s\n" - "add x24, x24, #4\n" - "fmla v9.4s, v20.4s, v4.4s\n" - "add x25, x25, #4\n" - "fmla v11.4s, v17.4s, v0.4s\n" - "add x26, x26, #4\n" - "str s10, [x22]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "fmla v9.4s, v18.4s, v3.4s\n" - "fmla v9.4s, v16.4s, v0.4s\n" - "str s9, [x22, %[output_col_stride1]]\n" - "add x22, x22, #4\n" - "7:\n" - : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - __asm __volatile( - "mov x23, xzr\n" - "mov x24, xzr\n" - "and x25, %[n_channels], #3\n" - "lsr x26, %[n_channels], #2\n" - "cbz x26, 4f\n" - "1:\n" - "ldr q13, [%[wbptr]]\n" - "ldr x19, [%[inptrs], 0]\n" - "mov v10.16b, v13.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v8.16b, v13.16b\n" - "ldr q6, [%[wbptr], #32]\n" - "mov v9.16b, v13.16b\n" - "ldr q5, [%[wbptr], #48]\n" - "mov v7.16b, v13.16b\n" - "ldr q11, [%[wbptr], #64]\n" - "ldr q4, [%[wbptr], #80]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr q3, [%[wbptr], #96]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr q2, [%[wbptr], #112]\n" - "ldr x27, [%[inptrs], 120]\n" - "ldr q1, [%[wbptr], #128]\n" - "subs x26, x26, #1\n" - "ldr q0, [%[wbptr], #144]\n" - "ldr q14, [x19, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr q18, [x20, x23]\n" - "ldr q14, [x21, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr q16, [x27, x23]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr q19, [x19, x23]\n" - "ldr x21, [%[inptrs], 88]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr q15, [x20, x23]\n" - "ldr q18, [x21, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr q13, [x19, x23]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "beq 3f\n" - "2:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 
56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr q14, [x20, x23]\n" - "ldr q17, [x19, x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, v5.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr q19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr q13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr q18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr q17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr q16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr q19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr q12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr q11, [x22, x23]\n" - "ldr q13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr q14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr q16, [x27, x23]\n" - "ldr q15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "ldr q13, [%[wbptr]]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v10.16b, v13.16b\n" - "ldr q6, [%[wbptr], #32]\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "ldr q11, [%[wbptr], #64]\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "ldr q4, [%[wbptr], #80]\n" - "str q8, [x28, x24]\n" - "add x23, x23, #16\n" - "mov v8.16b, v13.16b\n" - "ldr q2, [%[wbptr], #112]\n" - "str q9, [x21, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "ldr q5, [%[wbptr], #48]\n" - "mov v9.16b, v13.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x27, [%[inptrs], 120]\n" - "subs x26, x26, #1\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "ldr q1, [%[wbptr], #128]\n" - "ldr q14, [x19, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr q18, [x20, x23]\n" - "ldr q14, [x21, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "ldr q3, [%[wbptr], #96]\n" - "ldr q19, [x19, x23]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr q16, [x27, x23]\n" - "ldr q15, [x20, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "str q7, [x28, x24]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v7.16b, v13.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr q13, [x19, x23]\n" - "ldr q18, [x21, x23]\n" - "add x24, x24, #16\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "bne 2b\n" - "3:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr q14, [x20, x23]\n" - "ldr q17, [x19, 
x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, v5.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr q19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr q13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr q18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr q17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr q15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr q16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr q19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr q12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr q11, [x22, x23]\n" - "ldr q13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr q14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr q16, [x27, x23]\n" - "ldr q15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "add x23, x23, #16\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "str q8, [x28, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "str q9, [x21, x24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "str q7, [x28, x24]\n" - "add x24, x24, #16\n" - "4:\n" - "cbz x25, 7f\n" - "ldr s13, [%[wbptr]]\n" - "mov v10.16b, v13.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v8.16b, v13.16b\n" - "ldr s6, [%[wbptr], #8]\n" - "mov v9.16b, v13.16b\n" - "ldr s5, [%[wbptr], #12]\n" - "mov v7.16b, v13.16b\n" - "ldr s11, [%[wbptr], #16]\n" - "ldr s4, [%[wbptr], #20]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr s3, [%[wbptr], #24]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr s2, [%[wbptr], #28]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr x27, [%[inptrs], 120]\n" - "ldr s0, [%[wbptr], #36]\n" - "subs x25, x25, #1\n" - "ldr s14, [x19, x23]\n" - "ldr s18, [x20, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr s14, [x21, x23]\n" - "ldr s16, [x27, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr x21, [%[inptrs], 88]\n" - "ldr s19, [x19, x23]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr s15, [x20, x23]\n" - "ldr s18, [x21, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr s13, [x19, x23]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "beq 6f\n" - "5:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr s14, [x20, x23]\n" - "ldr s17, [x19, x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, 
v5.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr s19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr s13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr s18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr s17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr s16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr s19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr s12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr s11, [x22, x23]\n" - "ldr s13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr s14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr s16, [x27, x23]\n" - "ldr s15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "ldr s13, [%[wbptr]]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v10.16b, v13.16b\n" - "ldr s6, [%[wbptr], #8]\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "ldr s11, [%[wbptr], #16]\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "ldr s4, [%[wbptr], #20]\n" - "str s8, [x28, x24]\n" - "add x23, x23, #4\n" - "mov v8.16b, v13.16b\n" - "ldr s2, [%[wbptr], #28]\n" - "str s9, [x21, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "ldr s5, [%[wbptr], #12]\n" - "mov v9.16b, v13.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x27, [%[inptrs], 120]\n" - "subs x25, x25, #1\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "ldr s1, [%[wbptr], #32]\n" - "ldr s14, [x19, x23]\n" - "fmla v10.4s, v14.4s, v12.4s\n" - "ldr s18, [x20, x23]\n" - "ldr s14, [x21, x23]\n" - "ldr x19, [%[inptrs], 8]\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "ldr s3, [%[wbptr], #24]\n" - "ldr s19, [x19, x23]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmla v10.4s, v18.4s, v11.4s\n" - "ldr s16, [x27, x23]\n" - "ldr s15, [x20, x23]\n" - "ldr x19, [%[inptrs], 16]\n" - "str s7, [x28, x24]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v7.16b, v13.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr s13, [x19, x23]\n" - "ldr s18, [x21, x23]\n" - "add x24, x24, #4\n" - "fmla v10.4s, v14.4s, v2.4s\n" - "bne 5b\n" - "6:\n" - "fmla v8.4s, v14.4s, v12.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v10.4s, v15.4s, v4.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v9.4s, v13.4s, v12.4s\n" - "ldr s14, [x20, x23]\n" - "ldr s17, [x19, x23]\n" - "ldr x22, [%[inptrs], 160]\n" - "fmla v8.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 128]\n" - "fmla v10.4s, v13.4s, v5.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v9.4s, v14.4s, v11.4s\n" - "ldr s19, [x27, x23]\n" - "ldr x21, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 64]\n" - 
"ldr x19, [%[inptrs], 32]\n" - "fmla v8.4s, v18.4s, v6.4s\n" - "ldr x22, [%[inptrs], 168]\n" - "fmla v10.4s, v18.4s, v1.4s\n" - "ldr s13, [x21, x23]\n" - "fmla v9.4s, v17.4s, v6.4s\n" - "ldr s18, [x20, x23]\n" - "fmla v7.4s, v13.4s, v12.4s\n" - "ldr s17, [x19, x23]\n" - "fmla v8.4s, v15.4s, v2.4s\n" - "ldr s15, [x22, x23]\n" - "fmla v10.4s, v14.4s, v3.4s\n" - "ldr x27, [%[inptrs], 136]\n" - "fmla v9.4s, v13.4s, v2.4s\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr s16, [x27, x23]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v8.4s, v19.4s, v4.4s\n" - "ldr s19, [x21, x23]\n" - "fmla v10.4s, v13.4s, v0.4s\n" - "ldr s12, [x20, x23]\n" - "fmla v9.4s, v18.4s, v4.4s\n" - "ldr x22, [%[inptrs], 176]\n" - "fmla v7.4s, v16.4s, v11.4s\n" - "ldr x27, [%[inptrs], 144]\n" - "fmla v8.4s, v13.4s, v5.4s\n" - "ldr s11, [x22, x23]\n" - "ldr s13, [x27, x23]\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v9.4s, v17.4s, v5.4s\n" - "ldr x22, [%[inptrs], 184]\n" - "fmla v7.4s, v19.4s, v6.4s\n" - "ldr s14, [x21, x23]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s17, [x22, x23]\n" - "ldr x27, [%[inptrs], 152]\n" - "ldr x22, [%[inptrs], 192]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v9.4s, v19.4s, v1.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s10, [x21, x24]\n" - "fmla v7.4s, v11.4s, v2.4s\n" - "fmla v8.4s, v16.4s, v3.4s\n" - "ldr s16, [x27, x23]\n" - "ldr s15, [x22, x23]\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v9.4s, v12.4s, v3.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v7.4s, v13.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v8.4s, v11.4s, v0.4s\n" - "add x23, x23, #4\n" - "fmla v9.4s, v14.4s, v0.4s\n" - "fmla v7.4s, v14.4s, v5.4s\n" - "str s8, [x28, x24]\n" - "ldr x28, [%[outptrs], 24]\n" - "str s9, [x21, x24]\n" - "fmla v7.4s, v17.4s, v1.4s\n" - "fmla v7.4s, v16.4s, v3.4s\n" - "fmla v7.4s, v15.4s, v0.4s\n" - "str s7, [x28, x24]\n" - "add x24, x24, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x24, %[inptr0], %[input_row_stride]\n" - "add x27, %[input_col_stride1], %[input_col_stride1]\n" - "add x19, %[outptr0], %[output_row_stride]\n" - "add x25, x24, %[input_row_stride]\n" - "add x23, x27, %[input_col_stride1]\n" - "and x20, %[n_channels], #3\n" - "add x28, x25, %[input_row_stride]\n" - "add x22, x23, %[input_col_stride1]\n" - "lsr x21, %[n_channels], #2\n" - "add x26, x28, %[input_row_stride]\n" - "cbz x21, 4f\n" - "1:\n" - "ldr q16, [%[wbptr]]\n" - "subs x21, x21, #1\n" - "mov v3.16b, v16.16b\n" - "ldr q4, [%[wbptr], #16]\n" - "mov v1.16b, v16.16b\n" - "ldr q5, [%[wbptr], #32]\n" - "mov v2.16b, v16.16b\n" - "ldr q12, [%[wbptr], #48]\n" - "mov v0.16b, v16.16b\n" - "ldr q11, [%[wbptr], #64]\n" - "ldr q10, [%[wbptr], #80]\n" - "ldr q6, [%[wbptr], #96]\n" - "ldr q9, [%[wbptr], #112]\n" - "ldr q8, [%[wbptr], #128]\n" - "ldr q7, [%[wbptr], #144]\n" - "ldr q21, [%[inptr0]]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "ldr q23, [x24]\n" - "ldr q19, [%[inptr0], 
%[input_col_stride1]]\n" - "ldr q14, [x25]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr q13, [x24, %[input_col_stride1]]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr q18, [%[inptr0], x27]\n" - "ldr q15, [x28]\n" - "ldr q22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr q17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr q20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr q19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr q15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr q16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr q20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr q19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr q21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr q22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr q23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr q18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr q13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr q14, [x26, x23]\n" - "fmla v2.4s, v15.4s, v10.4s\n" - "ldr q17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr q15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr q16, [%[wbptr]]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "ldr q4, [%[wbptr], #16]\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "ldr q11, [%[wbptr], #64]\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "ldr q5, [%[wbptr], #32]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "ldr q21, [%[inptr0]]\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "ldr q9, [%[wbptr], #112]\n" - "movi v20.16b, #0\n" - "ldr q19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "ldr q18, [%[inptr0], x27]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "ldr q10, [%[wbptr], #80]\n" - "fmax v3.4s, v3.4s, v20.4s\n" - "add x24, x24, #16\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "ldr q23, [x24]\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "mov v3.16b, v16.16b\n" - "ldr q12, [%[wbptr], #48]\n" - "str q1, [x19]\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "mov v1.16b, v16.16b\n" - "ldr q8, [%[wbptr], #128]\n" - "mov v2.16b, v16.16b\n" - "ldr q13, [x24, %[input_col_stride1]]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "ldr q6, [%[wbptr], #96]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "add x25, x25, #16\n" - "ldr q14, [x25]\n" - "add x28, x28, #16\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "ldr q7, [%[wbptr], #144]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr q15, [x28]\n" - "str q0, [x19, %[output_col_stride1]]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "mov v0.16b, v16.16b\n" - "ldr q22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "add x26, x26, #16\n" - "add %[outptr0], %[outptr0], #16\n" - "add x19, x19, #16\n" - "subs x21, x21, #1\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr q17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr q20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr q19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, 
x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr q15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr q16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr q20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr q19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr q21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr q22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr q23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr q18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr q13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr q14, [x26, x23]\n" - "fmla v2.4s, v15.4s, v10.4s\n" - "ldr q17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr q15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "add x24, x24, #16\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "add x25, x25, #16\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "add x28, x28, #16\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add x26, x26, #16\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "movi v20.16b, #0\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "fmax v3.4s, v3.4s, v20.4s\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "str q3, [%[outptr0]]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "str q1, [x19]\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str q2, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "str q0, [x19, %[output_col_stride1]]\n" - "add x19, x19, #16\n" - "4:\n" - "cbz x20, 7f\n" - "ldr s16, [%[wbptr]]\n" - "mov v3.16b, v16.16b\n" - "ldr s4, [%[wbptr], #4]\n" - "mov v1.16b, v16.16b\n" - "ldr s5, [%[wbptr], #8]\n" - "mov v2.16b, v16.16b\n" - "ldr s12, [%[wbptr], #12]\n" - "mov v0.16b, v16.16b\n" - "ldr s11, [%[wbptr], #16]\n" - "ldr s10, [%[wbptr], #20]\n" - "subs x20, x20, #1\n" - "ldr s6, [%[wbptr], #24]\n" - "ldr s9, [%[wbptr], #28]\n" - "ldr s8, [%[wbptr], #32]\n" - "ldr s7, [%[wbptr], #36]\n" - "ldr s21, [%[inptr0]]\n" - "ldr s23, [x24]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "ldr s14, [x25]\n" - "ldr s13, [x24, %[input_col_stride1]]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr s18, [%[inptr0], x27]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr s15, [x28]\n" - "ldr s22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr s17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr s20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr s19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr s15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr s16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr s20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr s19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr s21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr s22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr s23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr s18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr s13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr s14, [x26, x23]\n" - 
"fmla v2.4s, v15.4s, v10.4s\n" - "ldr s17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr s15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr s16, [%[wbptr]]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "ldr s4, [%[wbptr], #4]\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "ldr s11, [%[wbptr], #16]\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "ldr s5, [%[wbptr], #8]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "ldr s21, [%[inptr0]]\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "ldr s9, [%[wbptr], #28]\n" - "movi v20.16b, #0\n" - "ldr s19, [%[inptr0], %[input_col_stride1]]\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "ldr s18, [%[inptr0], x27]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "ldr s10, [%[wbptr], #20]\n" - "fmax v3.4s, v3.4s, v20.4s\n" - "add x24, x24, #4\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "ldr s23, [x24]\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "mov v3.16b, v16.16b\n" - "ldr s12, [%[wbptr], #12]\n" - "str s1, [x19]\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "mov v1.16b, v16.16b\n" - "ldr s8, [%[wbptr], #32]\n" - "mov v2.16b, v16.16b\n" - "ldr s13, [x24, %[input_col_stride1]]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "ldr s6, [%[wbptr], #24]\n" - "fmla v3.4s, v21.4s, v4.4s\n" - "add x25, x25, #4\n" - "ldr s14, [x25]\n" - "add x28, x28, #4\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "ldr s7, [%[wbptr], #36]\n" - "fmla v3.4s, v23.4s, v11.4s\n" - "ldr s15, [x28]\n" - "str s0, [x19, %[output_col_stride1]]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "mov v0.16b, v16.16b\n" - "ldr s22, [x25, %[input_col_stride1]]\n" - "fmla v3.4s, v19.4s, v5.4s\n" - "add x26, x26, #4\n" - "add %[outptr0], %[outptr0], #4\n" - "add x19, x19, #4\n" - "subs x20, x20, #1\n" - "fmla v3.4s, v14.4s, v9.4s\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v13.4s, v10.4s\n" - "ldr s17, [x24, x27]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr s20, [%[inptr0], x23]\n" - "fmla v1.4s, v15.4s, v11.4s\n" - "ldr s19, [x26]\n" - "fmla v3.4s, v18.4s, v12.4s\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x27]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "ldr s15, [x24, x23]\n" - "fmla v3.4s, v22.4s, v8.4s\n" - "ldr s16, [%[inptr0], x22]\n" - "fmla v2.4s, v20.4s, v5.4s\n" - "ldr s20, [x26, %[input_col_stride1]]\n" - "fmla v1.4s, v19.4s, v9.4s\n" - "ldr s19, [x28, x27]\n" - "fmla v3.4s, v17.4s, v6.4s\n" - "ldr s21, [x25, x23]\n" - "fmla v2.4s, v14.4s, v9.4s\n" - "ldr s22, [x24, x22]\n" - "fmla v1.4s, v13.4s, v10.4s\n" - "ldr s23, [x26, x27]\n" - "fmla v3.4s, v14.4s, v7.4s\n" - "ldr s18, [x28, x23]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "ldr s13, [x25, x22]\n" - "fmla v1.4s, v14.4s, v12.4s\n" - "ldr s14, [x26, x23]\n" - "fmla v2.4s, v15.4s, v10.4s\n" - "ldr s17, [x28, x22]\n" - "fmla v0.4s, v19.4s, v11.4s\n" - "ldr s15, [x26, x22]\n" - "fmla v1.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v21.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v19.4s, v6.4s\n" - "add x24, x24, #4\n" - "fmla v2.4s, v21.4s, v8.4s\n" - "add x25, x25, #4\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "add x28, x28, #4\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "add x26, x26, #4\n" - "fmla v2.4s, v22.4s, v6.4s\n" - "movi v20.16b, #0\n" - "fmla v0.4s, v18.4s, v10.4s\n" - "fmax v3.4s, v3.4s, 
v20.4s\n" - "fmla v2.4s, v13.4s, v7.4s\n" - "fmax v1.4s, v1.4s, v20.4s\n" - "str s3, [%[outptr0]]\n" - "fmla v0.4s, v13.4s, v12.4s\n" - "str s1, [x19]\n" - "fmax v2.4s, v2.4s, v20.4s\n" - "fmla v0.4s, v14.4s, v8.4s\n" - "str s2, [%[outptr0], %[output_col_stride1]]\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v0.4s, v17.4s, v6.4s\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmax v0.4s, v0.4s, v20.4s\n" - "str s0, [x19, %[output_col_stride1]]\n" - "add x19, x19, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) - : [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - __asm __volatile( - "mov x22, xzr\n" - "mov x26, xzr\n" - "and x23, %[n_channels], #3\n" - "lsr x24, %[n_channels], #2\n" - "cbz x24, 4f\n" - "1:\n" - "ldr q14, [%[wbptr]]\n" - "ldr x19, [%[inptrs], 0]\n" - "mov v3.16b, v14.16b\n" - "ldr q13, [%[wbptr], #16]\n" - "mov v1.16b, v14.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v2.16b, v14.16b\n" - "ldr q4, [%[wbptr], #48]\n" - "mov v0.16b, v14.16b\n" - "ldr q12, [%[wbptr], #64]\n" - "ldr q9, [%[wbptr], #80]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr q8, [%[wbptr], #96]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr q7, [%[wbptr], #112]\n" - "ldr x25, [%[inptrs], 120]\n" - "ldr q6, [%[wbptr], #128]\n" - "subs x24, x24, #1\n" - "ldr q5, [%[wbptr], #144]\n" - "ldr q15, [x19, x22]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr q17, [x20, x22]\n" - "ldr q16, [x21, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr q15, [x25, x22]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr q10, [x19, x22]\n" - "ldr x21, [%[inptrs], 88]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q17, [x20, x22]\n" - "ldr q14, [x21, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr q18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "beq 3f\n" - "2:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr q16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr q17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr q14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr q18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr q13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v1.4s, v15.4s, 
v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr q16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr q17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr q12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr q15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr q11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr q16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "ldr q14, [%[wbptr]]\n" - "add x22, x22, #16\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "ldr q13, [%[wbptr], #16]\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "ldr q12, [%[wbptr], #64]\n" - "mov v3.16b, v14.16b\n" - "ldr q9, [%[wbptr], #80]\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "ldr q7, [%[wbptr], #112]\n" - "str q1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v1.16b, v14.16b\n" - "ldr q4, [%[wbptr], #48]\n" - "str q2, [x21, x26]\n" - "ldr x28, [%[outptrs], 24]\n" - "mov v2.16b, v14.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "ldr q6, [%[wbptr], #128]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x25, [%[inptrs], 120]\n" - "subs x24, x24, #1\n" - "ldr q15, [x19, x22]\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "ldr q8, [%[wbptr], #96]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr q17, [x20, x22]\n" - "ldr q16, [x21, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr q15, [x25, x22]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "ldr q5, [%[wbptr], #144]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr q10, [x19, x22]\n" - "ldr q17, [x20, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "str q0, [x28, x26]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v0.16b, v14.16b\n" - "ldr q18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "ldr q14, [x21, x22]\n" - "add x26, x26, #16\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "bne 2b\n" - "3:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr q16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr q17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr q14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr q17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr q18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr q13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v1.4s, v15.4s, 
v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr q16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr q17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr q12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr q15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr q14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr q11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str q3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr q18, [x27, x22]\n" - "ldr q17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr q16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x22, x22, #16\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "str q1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "ldr x28, [%[outptrs], 24]\n" - "str q2, [x21, x26]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "str q0, [x28, x26]\n" - "add x26, x26, #16\n" - "4:\n" - "cbz x23, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v3.16b, v14.16b\n" - "ldr s13, [%[wbptr], #4]\n" - "mov v1.16b, v14.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v2.16b, v14.16b\n" - "ldr s4, [%[wbptr], #12]\n" - "mov v0.16b, v14.16b\n" - "ldr s12, [%[wbptr], #16]\n" - "ldr s9, [%[wbptr], #20]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr s8, [%[wbptr], #24]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr s7, [%[wbptr], #28]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x25, [%[inptrs], 120]\n" - "ldr s5, [%[wbptr], #36]\n" - "subs x23, x23, #1\n" - "ldr s15, [x19, x22]\n" - "ldr s17, [x20, x22]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr s16, [x21, x22]\n" - "ldr s15, [x25, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr x20, [%[inptrs], 48]\n" - "ldr x21, [%[inptrs], 88]\n" - "ldr s10, [x19, x22]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr s17, [x20, x22]\n" - "ldr s14, [x21, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "ldr s18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "beq 6f\n" - "5:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr s16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr s17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr s14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr s18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr s13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - 
"fmla v1.4s, v15.4s, v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr s16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr s17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr s12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr s15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr s11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr s16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "ldr s14, [%[wbptr]]\n" - "add x22, x22, #4\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "ldr s13, [%[wbptr], #4]\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "ldr s11, [%[wbptr], #8]\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "ldr s12, [%[wbptr], #16]\n" - "mov v3.16b, v14.16b\n" - "ldr s9, [%[wbptr], #20]\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "ldr s7, [%[wbptr], #28]\n" - "str s1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v1.16b, v14.16b\n" - "ldr s4, [%[wbptr], #12]\n" - "str s2, [x21, x26]\n" - "ldr x28, [%[outptrs], 24]\n" - "mov v2.16b, v14.16b\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x19, [%[inptrs], 0]\n" - "ldr x20, [%[inptrs], 40]\n" - "ldr x21, [%[inptrs], 80]\n" - "ldr x25, [%[inptrs], 120]\n" - "subs x23, x23, #1\n" - "ldr s15, [x19, x22]\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "ldr s8, [%[wbptr], #24]\n" - "fmla v3.4s, v15.4s, v13.4s\n" - "ldr s17, [x20, x22]\n" - "ldr s16, [x21, x22]\n" - "ldr x19, [%[inptrs], 8]\n" - "ldr s15, [x25, x22]\n" - "ldr x20, [%[inptrs], 48]\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "ldr s5, [%[wbptr], #36]\n" - "fmla v3.4s, v17.4s, v12.4s\n" - "ldr s10, [x19, x22]\n" - "ldr s17, [x20, x22]\n" - "ldr x19, [%[inptrs], 16]\n" - "str s0, [x28, x26]\n" - "ldr x21, [%[inptrs], 88]\n" - "mov v0.16b, v14.16b\n" - "ldr s18, [x19, x22]\n" - "fmla v3.4s, v10.4s, v11.4s\n" - "ldr s14, [x21, x22]\n" - "add x26, x26, #4\n" - "fmla v3.4s, v16.4s, v7.4s\n" - "bne 5b\n" - "6:\n" - "fmla v1.4s, v16.4s, v13.4s\n" - "ldr x20, [%[inptrs], 56]\n" - "fmla v3.4s, v17.4s, v9.4s\n" - "ldr x19, [%[inptrs], 24]\n" - "fmla v2.4s, v18.4s, v13.4s\n" - "ldr s16, [x20, x22]\n" - "movi v10.16b, #0\n" - "ldr s17, [x19, x22]\n" - "fmla v1.4s, v15.4s, v12.4s\n" - "ldr x27, [%[inptrs], 160]\n" - "fmla v3.4s, v18.4s, v4.4s\n" - "ldr x25, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v12.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s15, [x25, x22]\n" - "ldr x21, [%[inptrs], 96]\n" - "fmla v1.4s, v14.4s, v11.4s\n" - "ldr x20, [%[inptrs], 64]\n" - "fmla v3.4s, v14.4s, v6.4s\n" - "ldr s14, [x21, x22]\n" - "fmla v2.4s, v17.4s, v11.4s\n" - "ldr s17, [x20, x22]\n" - "fmla v0.4s, v14.4s, v13.4s\n" - "ldr x19, [%[inptrs], 32]\n" - "fmla v1.4s, v18.4s, v7.4s\n" - "ldr x27, [%[inptrs], 168]\n" - "fmla v3.4s, v16.4s, v8.4s\n" - "ldr s18, [x19, x22]\n" - "fmla v2.4s, v14.4s, v7.4s\n" - "ldr s13, [x27, x22]\n" - "ldr x25, [%[inptrs], 136]\n" - "ldr x21, [%[inptrs], 104]\n" - "ldr x20, [%[inptrs], 72]\n" - "fmla v1.4s, 
v15.4s, v9.4s\n" - "ldr x27, [%[inptrs], 176]\n" - "fmla v3.4s, v14.4s, v5.4s\n" - "ldr s16, [x25, x22]\n" - "fmla v2.4s, v17.4s, v9.4s\n" - "ldr s17, [x21, x22]\n" - "fmla v0.4s, v16.4s, v12.4s\n" - "ldr s12, [x20, x22]\n" - "fmla v1.4s, v14.4s, v4.4s\n" - "ldr s15, [x27, x22]\n" - "fmax v3.4s, v3.4s, v10.4s\n" - "ldr x25, [%[inptrs], 144]\n" - "fmla v2.4s, v18.4s, v4.4s\n" - "ldr x21, [%[inptrs], 112]\n" - "fmla v0.4s, v17.4s, v11.4s\n" - "ldr s14, [x25, x22]\n" - "fmla v1.4s, v13.4s, v6.4s\n" - "ldr s11, [x21, x22]\n" - "ldr x27, [%[inptrs], 184]\n" - "ldr x25, [%[inptrs], 152]\n" - "ldr x21, [%[outptrs], 0]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr x28, [%[outptrs], 16]\n" - "str s3, [x21, x26]\n" - "fmla v0.4s, v15.4s, v7.4s\n" - "fmla v1.4s, v16.4s, v8.4s\n" - "ldr s18, [x27, x22]\n" - "ldr s17, [x25, x22]\n" - "ldr x27, [%[inptrs], 192]\n" - "fmla v2.4s, v12.4s, v8.4s\n" - "ldr x21, [%[outptrs], 8]\n" - "fmla v0.4s, v14.4s, v9.4s\n" - "ldr s16, [x27, x22]\n" - "fmla v1.4s, v15.4s, v5.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x22, x22, #4\n" - "fmla v2.4s, v11.4s, v5.4s\n" - "fmla v0.4s, v11.4s, v4.4s\n" - "fmax v1.4s, v1.4s, v10.4s\n" - "fmax v2.4s, v2.4s, v10.4s\n" - "str s1, [x28, x26]\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "ldr x28, [%[outptrs], 24]\n" - "str s2, [x21, x26]\n" - "fmla v0.4s, v17.4s, v8.4s\n" - "fmla v0.4s, v16.4s, v5.4s\n" - "fmax v0.4s, v0.4s, v10.4s\n" - "str s0, [x28, x26]\n" - "add x26, x26, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x21, %[inptr0], %[input_row_stride]\n" - "add x23, %[input_col_stride1], %[input_col_stride1]\n" - "add x24, %[outptr0], %[output_row_stride]\n" - "add x28, x21, %[input_row_stride]\n" - "add x26, x23, %[input_col_stride1]\n" - "and x19, %[n_channels], #3\n" - "add x27, x28, %[input_row_stride]\n" - "add x25, x26, %[input_col_stride1]\n" - "lsr x20, %[n_channels], #2\n" - "add x22, x27, %[input_row_stride]\n" - "cbz x20, 4f\n" - "1:\n" - "ldr q14, [%[wbptr]]\n" - "subs x20, x20, #1\n" - "mov v5.16b, v14.16b\n" - "ldr q0, [%[wbptr], #16]\n" - "mov v11.16b, v14.16b\n" - "ldr q1, [%[wbptr], #32]\n" - "mov v12.16b, v14.16b\n" - "ldr q2, [%[wbptr], #48]\n" - "mov v10.16b, v14.16b\n" - "ldr q6, [%[wbptr], #64]\n" - "ldr q3, [%[wbptr], #80]\n" - "ldr q7, [%[wbptr], #96]\n" - "ldr q4, [%[wbptr], #112]\n" - "ldr q8, [%[wbptr], #128]\n" - "ldr q9, [%[wbptr], #144]\n" - "ldr q19, [%[inptr0]]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "ldr q15, [x21]\n" - "ldr q21, [%[inptr0], %[input_col_stride1]]\n" - "ldr q16, [x28]\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "ldr q23, [x21, %[input_col_stride1]]\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "ldr q18, [%[inptr0], x23]\n" - "ldr q17, [x27]\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "beq 3f\n" - "2:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr q21, [x21, x23]\n" - "fmla 
v12.4s, v18.4s, v0.4s\n" - "ldr q20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr q19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr q15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr q16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr q17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr q14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr q20, [x22, %[input_col_stride1]]\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr q19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr q22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr q21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr q23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr q18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr q15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr q16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" - "ldr q17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr q13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "ldr q14, [%[wbptr]]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "ldr q0, [%[wbptr], #16]\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "ldr q6, [%[wbptr], #64]\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "ldr q1, [%[wbptr], #32]\n" - "fmla v11.4s, v23.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "ldr q19, [%[inptr0]]\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "ldr q4, [%[wbptr], #112]\n" - "movi v20.16b, #0\n" - "ldr q21, [%[inptr0], %[input_col_stride1]]\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "ldr q18, [%[inptr0], x23]\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "ldr q3, [%[wbptr], #80]\n" - "fmov v22.4s, #6.0\n" - "add x21, x21, #16\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "ldr q15, [x21]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "ldr q2, [%[wbptr], #48]\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "ldr q23, [x21, %[input_col_stride1]]\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "add x28, x28, #16\n" - "str q5, [%[outptr0]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "ldr q8, [%[wbptr], #128]\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "ldr q16, [x28]\n" - "str q12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v5.16b, v14.16b\n" - "ldr q13, [x28, %[input_col_stride1]]\n" - "str q11, [x24]\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "mov v11.16b, v14.16b\n" - "ldr q9, [%[wbptr], #144]\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "add x27, x27, #16\n" - "mov v12.16b, v14.16b\n" - "ldr q17, [x27]\n" - "str q10, [x24, %[output_col_stride1]]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "mov v10.16b, v14.16b\n" - "add x22, x22, #16\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "add x24, x24, #16\n" - "subs x20, x20, #1\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "bne 2b\n" - "3:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr q21, [x21, x23]\n" - "fmla v12.4s, v18.4s, v0.4s\n" - "ldr q20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr q19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr q15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr q16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr q17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr q14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr q20, [x22, %[input_col_stride1]]\n" - "fmla 
v11.4s, v19.4s, v4.4s\n" - "ldr q19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr q22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr q21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr q23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr q18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr q15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr q16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" - "ldr q17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr q13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "add x21, x21, #16\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "add x28, x28, #16\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "add x27, x27, #16\n" - "fmla v11.4s, v23.4s, v9.4s\n" - "add x22, x22, #16\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "movi v20.16b, #0\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "fmov v22.4s, #6.0\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "str q5, [%[outptr0]]\n" - "str q11, [x24]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "str q12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "str q10, [x24, %[output_col_stride1]]\n" - "add x24, x24, #16\n" - "4:\n" - "cbz x19, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v5.16b, v14.16b\n" - "ldr s0, [%[wbptr], #4]\n" - "mov v11.16b, v14.16b\n" - "ldr s1, [%[wbptr], #8]\n" - "mov v12.16b, v14.16b\n" - "ldr s2, [%[wbptr], #12]\n" - "mov v10.16b, v14.16b\n" - "ldr s6, [%[wbptr], #16]\n" - "ldr s3, [%[wbptr], #20]\n" - "subs x19, x19, #1\n" - "ldr s7, [%[wbptr], #24]\n" - "ldr s4, [%[wbptr], #28]\n" - "ldr s8, [%[wbptr], #32]\n" - "ldr s9, [%[wbptr], #36]\n" - "ldr s19, [%[inptr0]]\n" - "ldr s15, [x21]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "ldr s21, [%[inptr0], %[input_col_stride1]]\n" - "ldr s16, [x28]\n" - "ldr s23, [x21, %[input_col_stride1]]\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "ldr s18, [%[inptr0], x23]\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "ldr s17, [x27]\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "beq 6f\n" - "5:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr s21, [x21, x23]\n" - "fmla v12.4s, v18.4s, v0.4s\n" - "ldr s20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr s19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr s15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr s16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr s17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr s14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr s20, [x22, %[input_col_stride1]]\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr s22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr s21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr s23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr s18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr s15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr s16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" 
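
// [Editorial annotation, not part of the patch] The surrounding instructions
// show the fused activation of this kernel variant: "movi v20.16b, #0" zeroes
// a vector, "fmov v22.4s, #6.0" broadcasts 6.0, and each accumulator passes
// through fmax/fmin before being stored, i.e. a ReLU6 clamp to [0, 6]. A
// minimal NEON-intrinsics sketch of that clamp (illustrative helper name,
// not from the patch):
//
//     #include <arm_neon.h>
//
//     static inline float32x4_t relu6_f32(float32x4_t acc)
//     {
//         const float32x4_t zero = vdupq_n_f32(0.0f);  // movi v20.16b, #0
//         const float32x4_t six  = vdupq_n_f32(6.0f);  // fmov v22.4s, #6.0
//         return vminq_f32(vmaxq_f32(acc, zero), six); // fmax, then fmin
//     }
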
- "ldr s17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr s13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "ldr s14, [%[wbptr]]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "ldr s0, [%[wbptr], #4]\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "ldr s6, [%[wbptr], #16]\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "ldr s1, [%[wbptr], #8]\n" - "fmla v11.4s, v23.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "ldr s19, [%[inptr0]]\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "ldr s4, [%[wbptr], #28]\n" - "movi v20.16b, #0\n" - "ldr s21, [%[inptr0], %[input_col_stride1]]\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "ldr s18, [%[inptr0], x23]\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "ldr s3, [%[wbptr], #20]\n" - "fmov v22.4s, #6.0\n" - "add x21, x21, #4\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "ldr s15, [x21]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "ldr s2, [%[wbptr], #12]\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "ldr s23, [x21, %[input_col_stride1]]\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "add x28, x28, #4\n" - "str s5, [%[outptr0]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "ldr s8, [%[wbptr], #32]\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "ldr s16, [x28]\n" - "str s12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v5.16b, v14.16b\n" - "ldr s13, [x28, %[input_col_stride1]]\n" - "str s11, [x24]\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "mov v11.16b, v14.16b\n" - "ldr s9, [%[wbptr], #36]\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "add x27, x27, #4\n" - "mov v12.16b, v14.16b\n" - "ldr s17, [x27]\n" - "str s10, [x24, %[output_col_stride1]]\n" - "fmla v5.4s, v19.4s, v0.4s\n" - "mov v10.16b, v14.16b\n" - "add x22, x22, #4\n" - "fmla v11.4s, v16.4s, v0.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v5.4s, v15.4s, v6.4s\n" - "add x24, x24, #4\n" - "subs x19, x19, #1\n" - "fmla v5.4s, v21.4s, v1.4s\n" - "fmla v5.4s, v16.4s, v4.4s\n" - "bne 5b\n" - "6:\n" - "fmla v5.4s, v23.4s, v3.4s\n" - "ldr s21, [x21, x23]\n" - "fmla v12.4s, v18.4s, v0.4s\n" - "ldr s20, [%[inptr0], x26]\n" - "fmla v11.4s, v17.4s, v6.4s\n" - "ldr s19, [x22]\n" - "fmla v5.4s, v18.4s, v2.4s\n" - "ldr s15, [x27, %[input_col_stride1]]\n" - "fmla v12.4s, v21.4s, v6.4s\n" - "ldr s16, [x28, x23]\n" - "fmla v11.4s, v13.4s, v1.4s\n" - "ldr s17, [x21, x26]\n" - "fmla v5.4s, v13.4s, v8.4s\n" - "ldr s14, [%[inptr0], x25]\n" - "fmla v12.4s, v20.4s, v1.4s\n" - "ldr s20, [x22, %[input_col_stride1]]\n" - "fmla v11.4s, v19.4s, v4.4s\n" - "ldr s19, [x27, x23]\n" - "fmla v5.4s, v21.4s, v7.4s\n" - "ldr s22, [x28, x26]\n" - "fmla v12.4s, v16.4s, v4.4s\n" - "ldr s21, [x21, x25]\n" - "fmla v11.4s, v15.4s, v3.4s\n" - "ldr s23, [x22, x23]\n" - "fmla v5.4s, v16.4s, v9.4s\n" - "ldr s18, [x27, x26]\n" - "fmla v10.4s, v16.4s, v0.4s\n" - "ldr s15, [x28, x25]\n" - "fmla v11.4s, v16.4s, v2.4s\n" - "ldr s16, [x22, x26]\n" - "fmla v12.4s, v17.4s, v3.4s\n" - "ldr s17, [x27, x25]\n" - "fmla v10.4s, v19.4s, v6.4s\n" - "ldr s13, [x22, x25]\n" - "fmla v11.4s, v20.4s, v8.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v12.4s, v14.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v10.4s, v22.4s, v1.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v11.4s, v19.4s, v7.4s\n" - "add x21, x21, #4\n" - "fmla v12.4s, v22.4s, v8.4s\n" - "add x28, x28, #4\n" - "fmla v10.4s, v23.4s, v4.4s\n" - "add x27, x27, #4\n" - "fmla 
v11.4s, v23.4s, v9.4s\n" - "add x22, x22, #4\n" - "fmla v12.4s, v21.4s, v7.4s\n" - "movi v20.16b, #0\n" - "fmla v10.4s, v18.4s, v3.4s\n" - "fmov v22.4s, #6.0\n" - "fmax v5.4s, v5.4s, v20.4s\n" - "fmax v11.4s, v11.4s, v20.4s\n" - "fmla v12.4s, v15.4s, v9.4s\n" - "fmla v10.4s, v15.4s, v2.4s\n" - "fmin v5.4s, v5.4s, v22.4s\n" - "fmin v11.4s, v11.4s, v22.4s\n" - "fmax v12.4s, v12.4s, v20.4s\n" - "str s5, [%[outptr0]]\n" - "str s11, [x24]\n" - "fmla v10.4s, v16.4s, v8.4s\n" - "fmin v12.4s, v12.4s, v22.4s\n" - "str s12, [%[outptr0], %[output_col_stride1]]\n" - "fmla v10.4s, v17.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v10.4s, v13.4s, v9.4s\n" - "fmax v10.4s, v10.4s, v20.4s\n" - "fmin v10.4s, v10.4s, v22.4s\n" - "str s10, [x24, %[output_col_stride1]]\n" - "add x24, x24, #4\n" - "7:\n" - : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr) - : [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - __asm __volatile( - "mov x27, xzr\n" - "mov x28, xzr\n" - "and x26, %[n_channels], #3\n" - "lsr x25, %[n_channels], #2\n" - "cbz x25, 4f\n" - "1:\n" - "ldr q15, [%[wbptr]]\n" - "ldr x21, [%[inptrs], 0]\n" - "mov v8.16b, v15.16b\n" - "ldr q14, [%[wbptr], #16]\n" - "mov v3.16b, v15.16b\n" - "ldr q10, [%[wbptr], #32]\n" - "mov v2.16b, v15.16b\n" - "ldr q7, [%[wbptr], #48]\n" - "mov v4.16b, v15.16b\n" - "ldr q13, [%[wbptr], #64]\n" - "ldr q5, [%[wbptr], #80]\n" - "ldr x22, [%[inptrs], 40]\n" - "ldr q0, [%[wbptr], #96]\n" - "ldr x20, [%[inptrs], 80]\n" - "ldr q9, [%[wbptr], #112]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr q6, [%[wbptr], #128]\n" - "subs x25, x25, #1\n" - "ldr q1, [%[wbptr], #144]\n" - "ldr q17, [x21, x27]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr q18, [x22, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "ldr q17, [x23, x27]\n" - "ldr x22, [%[inptrs], 48]\n" - "ldr q11, [x21, x27]\n" - "ldr x20, [%[inptrs], 88]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr q19, [x22, x27]\n" - "ldr q15, [x20, x27]\n" - "ldr x21, [%[inptrs], 16]\n" - "ldr q12, [x21, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "beq 3f\n" - "2:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr q16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr q18, [x21, x27]\n" - "fmla v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v13.4s\n" - "ldr q19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr q17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - 
"ldr q18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr q19, [x21, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr q17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr q18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr q13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr q10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr q9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr q19, [x23, x27]\n" - "str q8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr q16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "ldr q15, [%[wbptr]]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "ldr q14, [%[wbptr], #16]\n" - "mov v8.16b, v15.16b\n" - "ldr q10, [%[wbptr], #32]\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "ldr q13, [%[wbptr], #64]\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "ldr q7, [%[wbptr], #48]\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "ldr q5, [%[wbptr], #80]\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "ldr q9, [%[wbptr], #112]\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x27, x27, #16\n" - "str q3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str q2, [x22, x28]\n" - "mov v3.16b, v15.16b\n" - "mov v2.16b, v15.16b\n" - "ldr q6, [%[wbptr], #128]\n" - "ldr x24, [%[outptrs], 24]\n" - "ldr x21, [%[inptrs], 0]\n" - "ldr x22, [%[inptrs], 40]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "ldr q0, [%[wbptr], #96]\n" - "ldr q17, [x21, x27]\n" - "ldr x20, [%[inptrs], 80]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr q18, [x22, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "ldr q1, [%[wbptr], #144]\n" - "ldr q11, [x21, x27]\n" - "ldr x22, [%[inptrs], 48]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr x21, [%[inptrs], 16]\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "ldr q19, [x22, x27]\n" - "ldr q12, [x21, x27]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr x20, [%[inptrs], 88]\n" - "subs x25, x25, #1\n" - "str q4, [x24, x28]\n" - "mov v4.16b, v15.16b\n" - "ldr q17, [x23, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "ldr q15, [x20, x27]\n" - "add x28, x28, #16\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "bne 2b\n" - "3:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr q16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr q18, [x21, x27]\n" - "fmla v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v13.4s\n" - "ldr q19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr q17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla 
v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - "ldr q18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr q19, [x21, x27]\n" - "ldr q16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr q14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr q17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr q18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr q13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr q10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr q15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr q9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr q19, [x23, x27]\n" - "str q8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr q16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "add x27, x27, #16\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "str q3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str q2, [x22, x28]\n" - "ldr x24, [%[outptrs], 24]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "str q4, [x24, x28]\n" - "add x28, x28, #16\n" - "4:\n" - "cbz x26, 7f\n" - "ldr s15, [%[wbptr]]\n" - "mov v8.16b, v15.16b\n" - "ldr s14, [%[wbptr], #4]\n" - "mov v3.16b, v15.16b\n" - "ldr s10, [%[wbptr], #8]\n" - "mov v2.16b, v15.16b\n" - "ldr s7, [%[wbptr], #12]\n" - "mov v4.16b, v15.16b\n" - "ldr s13, [%[wbptr], #16]\n" - "ldr s5, [%[wbptr], #20]\n" - "ldr x21, [%[inptrs], 0]\n" - "ldr s0, [%[wbptr], #24]\n" - "ldr x22, [%[inptrs], 40]\n" - "ldr s9, [%[wbptr], #28]\n" - "ldr x20, [%[inptrs], 80]\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr s1, [%[wbptr], #36]\n" - "subs x26, x26, #1\n" - "ldr s17, [x21, x27]\n" - "ldr s18, [x22, x27]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr s16, [x20, x27]\n" - "ldr s17, [x23, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "ldr x22, [%[inptrs], 48]\n" - "ldr x20, [%[inptrs], 88]\n" - "ldr s11, [x21, x27]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr s19, [x22, x27]\n" - "ldr s15, [x20, x27]\n" - "ldr x21, [%[inptrs], 16]\n" - "ldr s12, [x21, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "beq 6f\n" - "5:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr s16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr s18, [x21, x27]\n" - "fmla v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, 
v16.4s, v13.4s\n" - "ldr s19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr s17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr s19, [x21, x27]\n" - "ldr s16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr s17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr s13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr s10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr s9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr s19, [x23, x27]\n" - "str s8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr s16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "ldr s15, [%[wbptr]]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "ldr s14, [%[wbptr], #4]\n" - "mov v8.16b, v15.16b\n" - "ldr s10, [%[wbptr], #8]\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "ldr s13, [%[wbptr], #16]\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "ldr s7, [%[wbptr], #12]\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "ldr s5, [%[wbptr], #20]\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "ldr s9, [%[wbptr], #28]\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "add x27, x27, #4\n" - "str s3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str s2, [x22, x28]\n" - "mov v3.16b, v15.16b\n" - "mov v2.16b, v15.16b\n" - "ldr s6, [%[wbptr], #32]\n" - "ldr x24, [%[outptrs], 24]\n" - "ldr x21, [%[inptrs], 0]\n" - "ldr x22, [%[inptrs], 40]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "ldr s0, [%[wbptr], #24]\n" - "ldr s17, [x21, x27]\n" - "ldr x20, [%[inptrs], 80]\n" - "fmla v8.4s, v17.4s, v14.4s\n" - "ldr s18, [x22, x27]\n" - "ldr s16, [x20, x27]\n" - "ldr x21, [%[inptrs], 8]\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "ldr s1, [%[wbptr], #36]\n" - "ldr s11, [x21, x27]\n" - "ldr x22, [%[inptrs], 48]\n" - "fmla v8.4s, v18.4s, v13.4s\n" - "ldr x21, [%[inptrs], 16]\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "ldr s19, [x22, x27]\n" - "ldr s12, [x21, x27]\n" - "ldr x23, [%[inptrs], 120]\n" - "ldr x20, [%[inptrs], 88]\n" - "subs x26, x26, #1\n" - "str s4, [x24, x28]\n" - "mov v4.16b, v15.16b\n" - "ldr s17, [x23, x27]\n" - "fmla v8.4s, v11.4s, v10.4s\n" - "ldr s15, [x20, x27]\n" - "add x28, x28, #4\n" - "fmla v8.4s, v16.4s, v9.4s\n" - "bne 5b\n" - "6:\n" - "fmla v3.4s, v16.4s, v14.4s\n" - "ldr x22, [%[inptrs], 56]\n" - "fmla v8.4s, v19.4s, v5.4s\n" - "ldr x21, [%[inptrs], 24]\n" - "fmla v2.4s, v12.4s, v14.4s\n" - "ldr s16, [x22, x27]\n" - "movi v11.16b, #0\n" - "ldr s18, [x21, x27]\n" - "fmla 
v3.4s, v17.4s, v13.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v8.4s, v12.4s, v7.4s\n" - "ldr x23, [%[inptrs], 128]\n" - "fmla v2.4s, v16.4s, v13.4s\n" - "ldr s19, [x20, x27]\n" - "fmov v12.4s, #6.0\n" - "ldr s17, [x23, x27]\n" - "fmla v3.4s, v15.4s, v10.4s\n" - "ldr x20, [%[inptrs], 96]\n" - "fmla v8.4s, v15.4s, v6.4s\n" - "ldr x22, [%[inptrs], 64]\n" - "fmla v2.4s, v18.4s, v10.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v15.4s, v14.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v3.4s, v19.4s, v9.4s\n" - "ldr x21, [%[inptrs], 32]\n" - "fmla v8.4s, v16.4s, v0.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v2.4s, v15.4s, v9.4s\n" - "ldr s19, [x21, x27]\n" - "ldr s16, [x20, x27]\n" - "ldr x23, [%[inptrs], 136]\n" - "fmla v3.4s, v17.4s, v5.4s\n" - "ldr x20, [%[inptrs], 104]\n" - "fmla v8.4s, v15.4s, v1.4s\n" - "ldr s14, [x23, x27]\n" - "fmla v2.4s, v18.4s, v5.4s\n" - "ldr s17, [x20, x27]\n" - "fmla v4.4s, v14.4s, v13.4s\n" - "ldr x22, [%[inptrs], 72]\n" - "fmla v3.4s, v15.4s, v7.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmax v8.4s, v8.4s, v11.4s\n" - "ldr s18, [x22, x27]\n" - "fmla v2.4s, v19.4s, v7.4s\n" - "ldr s13, [x20, x27]\n" - "fmla v4.4s, v17.4s, v10.4s\n" - "ldr x23, [%[inptrs], 144]\n" - "fmla v3.4s, v16.4s, v6.4s\n" - "ldr x20, [%[inptrs], 112]\n" - "fmin v8.4s, v8.4s, v12.4s\n" - "ldr s10, [x23, x27]\n" - "fmla v2.4s, v17.4s, v6.4s\n" - "ldr s15, [x20, x27]\n" - "fmla v4.4s, v13.4s, v9.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v3.4s, v14.4s, v0.4s\n" - "ldr x23, [%[inptrs], 152]\n" - "ldr s9, [x20, x27]\n" - "ldr x22, [%[outptrs], 0]\n" - "fmla v2.4s, v18.4s, v0.4s\n" - "ldr s19, [x23, x27]\n" - "str s8, [x22, x28]\n" - "fmla v4.4s, v10.4s, v5.4s\n" - "fmla v3.4s, v13.4s, v1.4s\n" - "ldr x20, [%[inptrs], 192]\n" - "ldr x22, [%[outptrs], 8]\n" - "ldr x24, [%[outptrs], 16]\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v2.4s, v15.4s, v1.4s\n" - "ldr s16, [x20, x27]\n" - "fmla v4.4s, v15.4s, v7.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmax v3.4s, v3.4s, v11.4s\n" - "add x27, x27, #4\n" - "fmax v2.4s, v2.4s, v11.4s\n" - "fmla v4.4s, v9.4s, v6.4s\n" - "fmin v3.4s, v3.4s, v12.4s\n" - "fmin v2.4s, v2.4s, v12.4s\n" - "str s3, [x24, x28]\n" - "fmla v4.4s, v19.4s, v0.4s\n" - "str s2, [x22, x28]\n" - "ldr x24, [%[outptrs], 24]\n" - "fmla v4.4s, v16.4s, v1.4s\n" - "fmax v4.4s, v4.4s, v11.4s\n" - "fmin v4.4s, v4.4s, v12.4s\n" - "str s4, [x24, x28]\n" - "add x28, x28, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -#endif // __aarch64__ - -template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp deleted file mode 100644 index 2142c431ac..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp +++ /dev/null @@ -1,2341 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "impl_fp32_fp32.hpp"
-
-namespace depthwise
-{
-
-using namespace neon_convolution_kernels;
-using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;
-
-#ifdef __aarch64__
-template <>
-template <>
-void Conv::execute_tile(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x20, %[inptr0], %[input_row_stride]\n"
-    "add x13, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x24, %[outptr0], %[output_row_stride]\n"
-    "add x21, x20, %[input_row_stride]\n"
-    "add x14, x13, #64\n"
-    "add x15, x13, %[input_col_stride1]\n"
-    "add x22, x21, %[input_row_stride]\n"
-    "add x16, x15, #64\n"
-    "add x17, x15, %[input_col_stride1]\n"
-    "add x23, x22, %[input_row_stride]\n"
-    "add x9, x17, #64\n"
-    "add x25, x24, %[output_row_stride]\n"
-    "add x26, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x27, %[n_channels], #3\n"
-    "lsr x28, %[n_channels], #2\n"
-    "cbz x28, 4f\n"
-    "1:\n"
-    "ldr q25, [%[wbptr]]\n"
-    "subs x28, x28, #1\n"
-    "mov v17.16b, v25.16b\n"
-    "ldr q16, [%[wbptr], #16]\n"
-    "mov v13.16b, v25.16b\n"
-    "ldr q7, [%[wbptr], #32]\n"
-    "mov v15.16b, v25.16b\n"
-    "ldr q6, [%[wbptr], #48]\n"
-    "mov v10.16b, v25.16b\n"
-    "ldr q5, [%[wbptr], #64]\n"
-    "mov v12.16b, v25.16b\n"
-    "ldr q4, [%[wbptr], #80]\n"
-    "mov v14.16b, v25.16b\n"
-    "ldr q3, [%[wbptr], #96]\n"
-    "mov v9.16b, v25.16b\n"
-    "ldr q2, [%[wbptr], #112]\n"
-    "mov v11.16b, v25.16b\n"
-    "ldr q1, [%[wbptr], #128]\n"
-    "mov v8.16b, v25.16b\n"
-    "ldr q0, [%[wbptr], #144]\n"
-    "ldr q26, [%[inptr0]]\n"
-    "ldr q28, [x20]\n"
-    "fmla v17.4s, v26.4s, v16.4s\n"
-    "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
-    "fmla v13.4s, v28.4s, v16.4s\n"
-    "ldr q27, [x21]\n"
-    "fmla v15.4s, v29.4s, v16.4s\n"
-    "ldr q21, [x20, %[input_col_stride1]]\n"
-    "fmla v17.4s, v28.4s, v5.4s\n"
-    "ldr q20, [%[inptr0], x13]\n"
-    "ldr q23, [x22]\n"
-    "ldr q19, [x21, %[input_col_stride1]]\n"
-    "prfm pldl1keep, [%[inptr0], #64]\n"
-    "prfm pldl1keep, [x20, #64]\n"
-    "fmla v17.4s, v29.4s, v7.4s\n"
-    "prfm pldl1keep, [%[inptr0], x19]\n"
-    "prfm pldl1keep, [x21, #64]\n"
-    "prfm pldl1keep, [x20, x19]\n"
-    "prfm pldl1keep, [%[inptr0], x14]\n"
-    "prfm pldl1keep, [x22, #64]\n"
-    "prfm pldl1keep, [x21, x19]\n"
-    "beq
3f\n"
-    "2:\n"
[... elided: the unrolled fused multiply-accumulate main loop (bias in v25, the nine 3x3 taps in v0-v7/v16, with interleaved pldl1keep prefetches and rolling ldr/str of input/output columns), its non-looping tail at "3:", and the s-register per-channel remainder path at "4:"-"6:"; this specialisation stores the accumulators without an activation ...]
-    "7:\n"
-    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
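All three removed specialisations in this hunk share one tile-level contract: a channel count, a combined weight-and-bias pointer, and input/output base pointers whose row/column strides are passed in elements and only scaled by sizeof(float) where they are bound into the asm operand lists (note the `* sizeof(float)` and `(long) n_channels` in the constraints above). A minimal caller sketch under that assumption; `TileFn` and `run_grid` are hypothetical names for illustration, not ACL API:

#include <cstddef>

// Hypothetical stand-in for the removed Conv::execute_tile specialisations.
using TileFn = void (*)(int n_channels, const void *weight_bias_ptr,
                        const float *input, unsigned int input_row_stride,
                        unsigned int input_col_stride, float *output,
                        unsigned int output_row_stride, unsigned int output_col_stride);

// Walks a grid of output tiles. All strides are in elements, matching the
// kernels in this hunk, which multiply by sizeof(float) only when binding
// the asm operands.
void run_grid(TileFn run_tile, int tile_rows, int tile_cols,
              int in_tile_rows, int in_tile_cols,
              int out_tile_rows, int out_tile_cols,
              int n_channels, const void *weight_bias_ptr,
              const float *input, unsigned int in_row_stride, unsigned int in_col_stride,
              float *output, unsigned int out_row_stride, unsigned int out_col_stride)
{
  for (int ti = 0; ti < tile_rows; ti++)
  {
    for (int tj = 0; tj < tile_cols; tj++)
    {
      // Element (not byte) offsets; channels are contiguous in NHWC.
      const float *in_tile = input
        + std::size_t(ti) * in_tile_rows * in_row_stride
        + std::size_t(tj) * in_tile_cols * in_col_stride;
      float *out_tile = output
        + std::size_t(ti) * out_tile_rows * out_row_stride
        + std::size_t(tj) * out_tile_cols * out_col_stride;
      run_tile(n_channels, weight_bias_ptr, in_tile, in_row_stride, in_col_stride,
               out_tile, out_row_stride, out_col_stride);
    }
  }
}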
-template <>
-template <>
-void Conv::execute_tile(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x25, %[inptr0], %[input_row_stride]\n"
-    "add x16, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x21, %[outptr0], %[output_row_stride]\n"
-    "add x22, x25, %[input_row_stride]\n"
-    "add x23, x16, #64\n"
-    "add x26, x16, %[input_col_stride1]\n"
-    "add x13, x22, %[input_row_stride]\n"
-    "add x20, x26, #64\n"
-    "add x9, x26, %[input_col_stride1]\n"
-    "add x24, x13, %[input_row_stride]\n"
-    "add x15, x9, #64\n"
-    "add x14, x21, %[output_row_stride]\n"
-    "add x19, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x27, %[n_channels], #3\n"
-    "lsr x28, %[n_channels], #2\n"
-    "cbz x28, 4f\n"
[... elided: bias/weight preload at "1:" (bias in v20, the nine taps in v0/v10-v17), the unrolled vector loop "2:"/"3:" and the s-register remainder "4:"-"6:"; this specialisation clamps each accumulator with fmax against a zeroed v29 (ReLU) before its store ...]
-    "7:\n"
-    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
-    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
-    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
-  );
-}
-
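The three specialisations differ only in the activation fused ahead of the stores: the first writes its accumulators out unclamped, the kernel above applies fmax against a zeroed v29 (a ReLU), and the kernel below additionally applies fmin against 6.0 loaded with fmov (a ReLU6). A sketch of the same epilogue in NEON intrinsics, for reference only, not code from the patch:

#include <arm_neon.h>

// Equivalent of the ReLU epilogue: fmax v, v, v29 with v29 zeroed by movi.
static inline float32x4_t apply_relu(float32x4_t acc)
{
  return vmaxq_f32(acc, vdupq_n_f32(0.0f));
}

// Equivalent of the ReLU6 epilogue: fmax against a zero register, then
// fmin against a register set to 6.0f (fmov v29.4s, #6.0), applied to each
// accumulator immediately before its str.
static inline float32x4_t apply_relu6(float32x4_t acc)
{
  const float32x4_t zero = vdupq_n_f32(0.0f);
  const float32x4_t six  = vdupq_n_f32(6.0f);
  return vminq_f32(vmaxq_f32(acc, zero), six);
}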
-template <>
-template <>
-void Conv::execute_tile(
-  int n_channels,
-  const void *weight_bias_ptr,
-  const float *input,
-  const unsigned int input_row_stride,
-  const unsigned int input_col_stride,
-  float *output,
-  const unsigned int output_row_stride,
-  const unsigned int output_col_stride
-)
-{
-  __asm __volatile(
-    "add x17, %[inptr0], %[input_row_stride]\n"
-    "add x9, %[input_col_stride1], %[input_col_stride1]\n"
-    "add x25, %[outptr0], %[output_row_stride]\n"
-    "add x14, x17, %[input_row_stride]\n"
-    "add x22, x9, #64\n"
-    "add x15, x9, %[input_col_stride1]\n"
-    "add x21, x14, %[input_row_stride]\n"
-    "add x16, x15, #64\n"
-    "add x24, x15, %[input_col_stride1]\n"
-    "add x26, x21, %[input_row_stride]\n"
-    "add x23, x24, #64\n"
-    "add x13, x25, %[output_row_stride]\n"
-    "add x27, %[output_col_stride1], %[output_col_stride1]\n"
-    "and x19, %[n_channels], #3\n"
-    "lsr x20, %[n_channels], #2\n"
-    "cbz x20, 4f\n"
[... elided: bias/weight preload at "1:" (bias in v19, taps in v9-v17, accumulators v0-v8), the unrolled vector loop "2:"/"3:" and the s-register remainder "4:"-"6:"; this specialisation clamps with fmax against a zeroed v30 and fmin against v29 = 6.0 (ReLU6) before each store ...]
-    "7:\n"
-    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
-    : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)),
[output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -#endif // __aarch64__ - -template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp deleted file mode 100644 index b798b8cdbe..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp +++ /dev/null @@ -1,769 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void* weight_bias_ptr, - const float* input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float* output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x15, %[inptr0], %[input_row_stride]\n" - "add x26, %[input_col_stride1], %[input_col_stride1]\n" - "add x21, %[outptr0], %[output_row_stride]\n" - "add x16, x15, %[input_row_stride]\n" - "add x27, x26, %[input_col_stride1]\n" - "add x22, x21, %[output_row_stride]\n" - "add x17, x16, %[input_row_stride]\n" - "add x28, x27, %[input_col_stride1]\n" - "add x23, %[output_col_stride1], %[output_col_stride1]\n" - "add x9, x17, %[input_row_stride]\n" - "add x13, x28, %[input_col_stride1]\n" - "and x24, %[n_channels], #3\n" - "add x19, x9, %[input_row_stride]\n" - "add x14, x13, %[input_col_stride1]\n" - "lsr x25, %[n_channels], #2\n" - "add x20, x19, %[input_row_stride]\n" - "cbz x25, 4f\n" - "1:\n" - "ldr q27, [%[wbptr]]\n" - "subs x25, x25, #1\n" - "mov v17.16b, v27.16b\n" - "ldr q6, [%[wbptr], #16]\n" - "mov v16.16b, v27.16b\n" - "ldr q14, [%[wbptr], #32]\n" - "mov v15.16b, v27.16b\n" - "ldr q13, [%[wbptr], #48]\n" - "mov v2.16b, v27.16b\n" - "ldr q12, [%[wbptr], #64]\n" - "mov v4.16b, v27.16b\n" - "ldr q11, [%[wbptr], #80]\n" - "mov v5.16b, v27.16b\n" - "ldr q10, [%[wbptr], #96]\n" - "mov v1.16b, v27.16b\n" - "ldr q9, [%[wbptr], #112]\n" - "mov v3.16b, v27.16b\n" - "ldr q8, [%[wbptr], #128]\n" - "mov v0.16b, v27.16b\n" - "ldr q7, [%[wbptr], #144]\n" - "ldr q29, [%[inptr0]]\n" - "ldr q28, [x15]\n" - "ldr q26, [%[inptr0], %[input_col_stride1]]\n" - "ldr q22, [x16]\n" - "ldr q20, [x15, %[input_col_stride1]]\n" - "ldr q19, [%[inptr0], x26]\n" - "ldr q30, [x17]\n" - "ldr q18, [x16, %[input_col_stride1]]\n" - "beq 3f\n" - "2:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr q21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr q25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr q24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr q23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "subs x25, x25, #1\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr q22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "ldr q30, [x19]\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr q29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr q28, [x17, x26]\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr q24, [x16, x27]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr q25, [x15, x28]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr q19, [%[inptr0], x13]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr q18, [x20]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "ldr q22, [x19, %[input_col_stride1]]\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr q26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr q20, [x17, x27]\n" - "fmla v16.4s, 
v28.4s, v10.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr q27, [x16, x28]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "ldr q30, [x15, x13]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr q24, [%[inptr0], x14]\n" - "str q17, [%[outptr0]]\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr q28, [x20, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "ldr q17, [x19, x26]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr q18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr q25, [x17, x28]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr q22, [x16, x13]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "str q16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "ldr q21, [x15, x14]\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr q23, [x20, x26]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr q19, [x19, x27]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "add x15, x15, #16\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "str q15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr q29, [x9, x28]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr q27, [x17, x13]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr q28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr q26, [x20, x27]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr q20, [x19, x28]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr q17, [x9, x13]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "ldr q18, [x17, x14]\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr q16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x16, x16, #16\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr q15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "add x17, x17, #16\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr q21, [x9, x14]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr q23, [x20, x13]\n" - "str q2, [x22]\n" - "fmla v4.4s, v29.4s, v7.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - "ldr q24, [x19, x14]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "ldr q25, [x20, x14]\n" - "str q4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr q27, [%[wbptr]]\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr q29, [%[inptr0]]\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "ldr q6, [%[wbptr], #16]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr q28, [x15]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr q26, [%[inptr0], %[input_col_stride1]]\n" - "str q5, [%[outptr0], x23]\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "ldr q22, [x16]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "ldr q20, [x15, %[input_col_stride1]]\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "ldr q12, [%[wbptr], #64]\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "ldr q19, [%[inptr0], x26]\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "ldr q30, [x17]\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "ldr q14, [%[wbptr], #32]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "ldr q18, [x16, %[input_col_stride1]]\n" - "str q1, [x22, %[output_col_stride1]]\n" - "mov v17.16b, v27.16b\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "ldr q9, [%[wbptr], #112]\n" - "str q3, [x21, x23]\n" - "mov v16.16b, v27.16b\n" - "mov v15.16b, v27.16b\n" - "add x9, x9, #16\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "ldr q11, [%[wbptr], #80]\n" - "mov v2.16b, v27.16b\n" - "add x19, x19, #16\n" - "mov v4.16b, v27.16b\n" - "add x20, x20, #16\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "ldr q13, [%[wbptr], #48]\n" - "mov v5.16b, v27.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "mov v1.16b, v27.16b\n" - "add x21, x21, #16\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "ldr 
q8, [%[wbptr], #128]\n" - "mov v3.16b, v27.16b\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "ldr q10, [%[wbptr], #96]\n" - "str q0, [x22, x23]\n" - "mov v0.16b, v27.16b\n" - "ldr q7, [%[wbptr], #144]\n" - "add x22, x22, #16\n" - "bne 2b\n" - "3:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr q21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr q27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr q25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr q24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr q23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr q22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "ldr q30, [x19]\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr q29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr q28, [x17, x26]\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "ldr q24, [x16, x27]\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr q25, [x15, x28]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "ldr q19, [%[inptr0], x13]\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr q18, [x20]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr q22, [x19, %[input_col_stride1]]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr q26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr q20, [x17, x27]\n" - "fmla v16.4s, v28.4s, v10.4s\n" - "ldr q27, [x16, x28]\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr q30, [x15, x13]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "ldr q24, [%[inptr0], x14]\n" - "str q17, [%[outptr0]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr q28, [x20, %[input_col_stride1]]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr q17, [x19, x26]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr q18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr q25, [x17, x28]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "ldr q22, [x16, x13]\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "str q16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr q21, [x15, x14]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr q23, [x20, x26]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "ldr q19, [x19, x27]\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "add x15, x15, #16\n" - "str q15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr q29, [x9, x28]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "ldr q27, [x17, x13]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr q28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr q26, [x20, x27]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr q20, [x19, x28]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr q17, [x9, x13]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr q18, [x17, x14]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "add x16, x16, #16\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr q16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x17, x17, #16\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr q15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr q21, [x9, x14]\n" - "fmla v4.4s, v29.4s, 
v7.4s\n" - "ldr q23, [x20, x13]\n" - "str q2, [x22]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - "ldr q24, [x19, x14]\n" - "str q4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr q25, [x20, x14]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "add x19, x19, #16\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "add x20, x20, #16\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "str q5, [%[outptr0], x23]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "str q1, [x22, %[output_col_stride1]]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "str q3, [x21, x23]\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "add x21, x21, #16\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "str q0, [x22, x23]\n" - "add x22, x22, #16\n" - "4:\n" - "cbz x24, 7f\n" - "ldr s27, [%[wbptr]]\n" - "mov v17.16b, v27.16b\n" - "ldr s6, [%[wbptr], #4]\n" - "mov v16.16b, v27.16b\n" - "ldr s14, [%[wbptr], #8]\n" - "mov v15.16b, v27.16b\n" - "ldr s13, [%[wbptr], #12]\n" - "mov v2.16b, v27.16b\n" - "ldr s12, [%[wbptr], #16]\n" - "mov v4.16b, v27.16b\n" - "ldr s11, [%[wbptr], #20]\n" - "mov v5.16b, v27.16b\n" - "ldr s10, [%[wbptr], #24]\n" - "mov v1.16b, v27.16b\n" - "ldr s9, [%[wbptr], #28]\n" - "mov v3.16b, v27.16b\n" - "ldr s8, [%[wbptr], #32]\n" - "mov v0.16b, v27.16b\n" - "ldr s7, [%[wbptr], #36]\n" - "ldr s29, [%[inptr0]]\n" - "subs x24, x24, #1\n" - "ldr s28, [x15]\n" - "ldr s26, [%[inptr0], %[input_col_stride1]]\n" - "ldr s22, [x16]\n" - "ldr s20, [x15, %[input_col_stride1]]\n" - "ldr s19, [%[inptr0], x26]\n" - "ldr s30, [x17]\n" - "ldr s18, [x16, %[input_col_stride1]]\n" - "beq 6f\n" - "5:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr s21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr s25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr s24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr s23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "subs x24, x24, #1\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr s22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "ldr s30, [x19]\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr s29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr s28, [x17, x26]\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr s24, [x16, x27]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "ldr s25, [x15, x28]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr s19, [%[inptr0], x13]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr s18, [x20]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "ldr s22, [x19, %[input_col_stride1]]\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr s26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr s20, [x17, x27]\n" - "fmla v16.4s, v28.4s, v10.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr s27, [x16, 
x28]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "ldr s30, [x15, x13]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr s24, [%[inptr0], x14]\n" - "str s17, [%[outptr0]]\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr s28, [x20, %[input_col_stride1]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "ldr s17, [x19, x26]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr s18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr s25, [x17, x28]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr s22, [x16, x13]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "str s16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "ldr s21, [x15, x14]\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr s23, [x20, x26]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr s19, [x19, x27]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "add x15, x15, #4\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "str s15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr s29, [x9, x28]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr s27, [x17, x13]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr s28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr s26, [x20, x27]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr s20, [x19, x28]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr s17, [x9, x13]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "ldr s18, [x17, x14]\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr s16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x16, x16, #4\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr s15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "add x17, x17, #4\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr s21, [x9, x14]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr s23, [x20, x13]\n" - "str s2, [x22]\n" - "fmla v4.4s, v29.4s, v7.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - "ldr s24, [x19, x14]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "ldr s25, [x20, x14]\n" - "str s4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr s27, [%[wbptr]]\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr s29, [%[inptr0]]\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "ldr s6, [%[wbptr], #4]\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "ldr s28, [x15]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr s26, [%[inptr0], %[input_col_stride1]]\n" - "str s5, [%[outptr0], x23]\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "ldr s22, [x16]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "ldr s20, [x15, %[input_col_stride1]]\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "ldr s12, [%[wbptr], #16]\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "ldr s19, [%[inptr0], x26]\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "ldr s30, [x17]\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "ldr s14, [%[wbptr], #8]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "ldr s18, [x16, %[input_col_stride1]]\n" - "str s1, [x22, %[output_col_stride1]]\n" - "mov v17.16b, v27.16b\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "ldr s9, [%[wbptr], #28]\n" - "str s3, [x21, x23]\n" - "mov v16.16b, v27.16b\n" - "mov v15.16b, v27.16b\n" - "add x9, x9, #4\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "ldr s11, [%[wbptr], #20]\n" - "mov v2.16b, v27.16b\n" - "add x19, x19, #4\n" - "mov v4.16b, v27.16b\n" - "add x20, x20, #4\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "ldr s13, [%[wbptr], #12]\n" - "mov v5.16b, v27.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "mov v1.16b, v27.16b\n" - "add x21, x21, #4\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "ldr s8, [%[wbptr], #32]\n" - "mov v3.16b, v27.16b\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "ldr s10, [%[wbptr], #24]\n" - 
"str s0, [x22, x23]\n" - "mov v0.16b, v27.16b\n" - "ldr s7, [%[wbptr], #36]\n" - "add x22, x22, #4\n" - "bne 5b\n" - "6:\n" - "fmla v17.4s, v29.4s, v6.4s\n" - "ldr s21, [x15, x26]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "ldr s27, [%[inptr0], x27]\n" - "fmla v15.4s, v19.4s, v6.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v28.4s, v12.4s\n" - "ldr s25, [x9]\n" - "fmla v16.4s, v30.4s, v12.4s\n" - "ldr s24, [x17, %[input_col_stride1]]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v17.4s, v26.4s, v14.4s\n" - "ldr s23, [x16, x26]\n" - "fmla v16.4s, v18.4s, v14.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v27.4s, v14.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v17.4s, v22.4s, v9.4s\n" - "ldr s22, [%[inptr0], x28]\n" - "fmla v16.4s, v25.4s, v9.4s\n" - "ldr s30, [x19]\n" - "fmla v15.4s, v23.4s, v9.4s\n" - "fmla v4.4s, v23.4s, v6.4s\n" - "fmla v17.4s, v20.4s, v11.4s\n" - "ldr s29, [x9, %[input_col_stride1]]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "ldr s28, [x17, x26]\n" - "fmla v15.4s, v26.4s, v11.4s\n" - "ldr s24, [x16, x27]\n" - "fmla v17.4s, v19.4s, v13.4s\n" - "ldr s25, [x15, x28]\n" - "fmla v16.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "ldr s19, [%[inptr0], x13]\n" - "fmla v17.4s, v18.4s, v8.4s\n" - "ldr s18, [x20]\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr s22, [x19, %[input_col_stride1]]\n" - "fmla v16.4s, v29.4s, v8.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v17.4s, v21.4s, v10.4s\n" - "ldr s26, [x9, x26]\n" - "fmla v2.4s, v29.4s, v14.4s\n" - "ldr s20, [x17, x27]\n" - "fmla v16.4s, v28.4s, v10.4s\n" - "ldr s27, [x16, x28]\n" - "fmla v17.4s, v23.4s, v7.4s\n" - "ldr s30, [x15, x13]\n" - "fmla v15.4s, v24.4s, v8.4s\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "fmla v5.4s, v25.4s, v12.4s\n" - "ldr s24, [%[inptr0], x14]\n" - "str s17, [%[outptr0]]\n" - "fmla v2.4s, v18.4s, v9.4s\n" - "fmla v15.4s, v25.4s, v10.4s\n" - "ldr s28, [x20, %[input_col_stride1]]\n" - "fmla v5.4s, v19.4s, v14.4s\n" - "ldr s17, [x19, x26]\n" - "fmla v2.4s, v22.4s, v11.4s\n" - "ldr s18, [x9, x27]\n" - "fmla v16.4s, v26.4s, v7.4s\n" - "ldr s25, [x17, x28]\n" - "fmla v4.4s, v26.4s, v9.4s\n" - "ldr s22, [x16, x13]\n" - "fmla v2.4s, v26.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "str s16, [x21]\n" - "fmla v1.4s, v26.4s, v6.4s\n" - "fmla v4.4s, v20.4s, v11.4s\n" - "ldr s21, [x15, x14]\n" - "fmla v15.4s, v27.4s, v7.4s\n" - "ldr s23, [x20, x26]\n" - "fmla v5.4s, v27.4s, v9.4s\n" - "ldr s19, [x19, x27]\n" - "fmla v4.4s, v27.4s, v13.4s\n" - "add x15, x15, #4\n" - "str s15, [%[outptr0], %[output_col_stride1]]\n" - "fmla v3.4s, v27.4s, v6.4s\n" - "fmla v5.4s, v30.4s, v11.4s\n" - "ldr s29, [x9, x28]\n" - "fmla v2.4s, v28.4s, v8.4s\n" - "ldr s27, [x17, x13]\n" - "fmla v1.4s, v17.4s, v12.4s\n" - "ldr s28, [x16, x14]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "ldr s26, [x20, x27]\n" - "fmla v2.4s, v17.4s, v10.4s\n" - "ldr s20, [x19, x28]\n" - "fmla v4.4s, v18.4s, v8.4s\n" - "ldr s17, [x9, x13]\n" - "fmla v1.4s, v18.4s, v14.4s\n" - "ldr s18, [x17, x14]\n" - "fmla v3.4s, v25.4s, v12.4s\n" - "add x16, x16, #4\n" - "fmla v4.4s, v25.4s, v10.4s\n" - "ldr s16, [x20, x28]\n" - "fmla v5.4s, v22.4s, v8.4s\n" - "add x17, x17, #4\n" - "fmla v3.4s, v22.4s, v14.4s\n" - "ldr s15, [x19, x13]\n" - "fmla v2.4s, v23.4s, v7.4s\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "fmla v5.4s, v21.4s, v10.4s\n" - "ldr s21, [x9, x14]\n" - "fmla v4.4s, v29.4s, v7.4s\n" - "ldr s23, [x20, x13]\n" - "str s2, [x22]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v3.4s, v29.4s, v9.4s\n" - 
"ldr s24, [x19, x14]\n" - "str s4, [x21, %[output_col_stride1]]\n" - "fmla v0.4s, v29.4s, v6.4s\n" - "fmla v1.4s, v29.4s, v13.4s\n" - "ldr s25, [x20, x14]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v5.4s, v28.4s, v7.4s\n" - "add x19, x19, #4\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "add x20, x20, #4\n" - "fmla v3.4s, v28.4s, v13.4s\n" - "fmla v0.4s, v20.4s, v12.4s\n" - "str s5, [%[outptr0], x23]\n" - "fmla v1.4s, v20.4s, v10.4s\n" - "fmla v3.4s, v17.4s, v8.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v0.4s, v17.4s, v14.4s\n" - "fmla v1.4s, v16.4s, v7.4s\n" - "fmla v3.4s, v18.4s, v10.4s\n" - "fmla v0.4s, v16.4s, v9.4s\n" - "str s1, [x22, %[output_col_stride1]]\n" - "fmla v3.4s, v21.4s, v7.4s\n" - "fmla v0.4s, v15.4s, v11.4s\n" - "str s3, [x21, x23]\n" - "fmla v0.4s, v21.4s, v13.4s\n" - "add x21, x21, #4\n" - "fmla v0.4s, v23.4s, v8.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v0.4s, v25.4s, v7.4s\n" - "str s0, [x22, x23]\n" - "add x22, x22, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [inptr0] "+r" (input), [outptr0] "+r" (output) - : [n_channels] "r" ((long long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x13", "x14", "memory" - ); -} -#endif // __aarch64__ - -template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp deleted file mode 100644 index 89d1f2238b..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp +++ /dev/null @@ -1,6018 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ - -using namespace neon_convolution_kernels; -using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; - -#ifdef __aarch64__ -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x8, %[inptr0], %[input_row_stride]\n" - "add x15, %[input_col_stride1], %[input_col_stride1]\n" - "add x23, %[outptr0], %[output_row_stride]\n" - "add x9, x8, %[input_row_stride]\n" - "add x16, x15, #64\n" - "add x17, x15, %[input_col_stride1]\n" - "add x10, x9, %[input_row_stride]\n" - "add x7, x17, #64\n" - "add x19, x17, %[input_col_stride1]\n" - "add x11, x10, %[input_row_stride]\n" - "add x20, x19, #64\n" - "add x21, x19, %[input_col_stride1]\n" - "add x12, x11, %[input_row_stride]\n" - "add x22, x21, #64\n" - "add x24, x23, %[output_row_stride]\n" - "add x25, x24, %[output_row_stride]\n" - "add x26, %[output_col_stride1], %[output_col_stride1]\n" - "and x13, %[n_channels], #3\n" - "add x27, x26, %[output_col_stride1]\n" - "lsr x14, %[n_channels], #2\n" - "cbz x14, 4f\n" - "1:\n" - "ldr q14, [%[wbptr]]\n" - "subs x14, x14, #1\n" - "mov v17.16b, v14.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v23.16b, v14.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v24.16b, v14.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "mov v20.16b, v14.16b\n" - "ldr q9, [%[wbptr], #64]\n" - "mov v16.16b, v14.16b\n" - "ldr q8, [%[wbptr], #80]\n" - "mov v13.16b, v14.16b\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v0.16b, v14.16b\n" - "ldr q6, [%[wbptr], #112]\n" - "mov v1.16b, v14.16b\n" - "ldr q5, [%[wbptr], #128]\n" - "mov v2.16b, v14.16b\n" - "ldr q4, [%[wbptr], #144]\n" - "mov v3.16b, v14.16b\n" - "ldr q29, [%[inptr0]]\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "ldr q28, [x8]\n" - "ldr q30, [%[inptr0], %[input_col_stride1]]\n" - "ldr q25, [x9]\n" - "ldr q26, [x8, %[input_col_stride1]]\n" - "ldr q27, [%[inptr0], x15]\n" - "ldr q15, [x10]\n" - "ldr q18, [x9, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x8, #64]\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [x8, x28]\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "prfm pldl1keep, [x10, #64]\n" - "prfm pldl1keep, [x9, x28]\n" - "beq 3f\n" - "2:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr q22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr q29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr q25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr q28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr q19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, 
x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr q21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr q27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - "fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr q18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr q22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr q25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "subs x14, x14, #1\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "str q17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr q29, [x9, x17]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v17.4s, v21.4s, v12.4s\n" - "ldr q26, [x8, x19]\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "str q23, [x23]\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr q28, [%[inptr0], x21]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr q30, [x12, %[input_col_stride1]]\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "str q24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "ldr q27, [x11, x15]\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr q23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - 
"fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr q24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr q29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr q14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #16\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "prfm pldl1keep, [x8, #64]\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "prfm pldl1keep, [x8, x28]\n" - "str q20, [x24]\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v6.4s\n" - "ldr q25, [x11, x17]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "ldr q30, [x10, x19]\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "str q16, [x23, %[output_col_stride1]]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "ldr q26, [x9, x21]\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "ldr q27, [x12, x17]\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr q20, [x11, x19]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "add x9, x9, #16\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "str q13, [%[outptr0], x26]\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "ldr q23, [x10, x21]\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr q24, [x12, x19]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [x9, x28]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr q16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #16\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr q13, [x12, x21]\n" - "str q0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "ldr q14, [%[wbptr]]\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "add x11, x11, #16\n" - "str q1, [x24, %[output_col_stride1]]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "ldr q29, [%[inptr0]]\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "ldr q28, [x8]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "add x12, x12, #16\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "str q2, [x23, x26]\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr q9, [%[wbptr], #64]\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "ldr q30, [%[inptr0], %[input_col_stride1]]\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "ldr q25, [x9]\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "str q3, [%[outptr0], x27]\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v19.4s, v27.4s, v5.4s\n" - "ldr q26, [x8, %[input_col_stride1]]\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "ldr q27, [%[inptr0], x15]\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "ldr q15, [x10]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "ldr q6, [%[wbptr], #112]\n" - "str q18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "ldr q18, [x9, %[input_col_stride1]]\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "ldr q8, [%[wbptr], #80]\n" - "str q17, [x23, x27]\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "add x23, x23, #16\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "ldr q10, [%[wbptr], #48]\n" - "str q19, [x25, x26]\n" - "mov v17.16b, v14.16b\n" - "str q22, [x24, x27]\n" - "mov v23.16b, v14.16b\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "ldr q5, 
[%[wbptr], #128]\n" - "mov v24.16b, v14.16b\n" - "add x24, x24, #16\n" - "mov v20.16b, v14.16b\n" - "mov v16.16b, v14.16b\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v13.16b, v14.16b\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v14.16b\n" - "mov v2.16b, v14.16b\n" - "str q21, [x25, x27]\n" - "mov v3.16b, v14.16b\n" - "ldr q4, [%[wbptr], #144]\n" - "add x25, x25, #16\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "bne 2b\n" - "3:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr q22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr q29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr q25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr q28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr q19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr q21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr q27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - "fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr q18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr q22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr q25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "str q17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr q29, [x9, x17]\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, 
v9.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "ldr q26, [x8, x19]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr q28, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" - "fmla v17.4s, v21.4s, v12.4s\n" - "ldr q30, [x12, %[input_col_stride1]]\n" - "str q23, [x23]\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "str q24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "ldr q27, [x11, x15]\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr q23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - "fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr q24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr q29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr q14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #16\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v6.4s\n" - "str q20, [x24]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "ldr q25, [x11, x17]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "ldr q30, [x10, x19]\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "str q16, [x23, %[output_col_stride1]]\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "ldr q26, [x9, x21]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr q27, [x12, x17]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "ldr q20, [x11, x19]\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "str q13, [%[outptr0], x26]\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr q23, [x10, x21]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "ldr q24, [x12, x19]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr q16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #16\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "add x11, x11, #16\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr q13, [x12, x21]\n" - "str q0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "add x12, x12, #16\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "str q1, [x24, %[output_col_stride1]]\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "str q2, [x23, x26]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - 
"fmla v19.4s, v27.4s, v5.4s\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "str q3, [%[outptr0], x27]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "str q18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "str q17, [x23, x27]\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "str q19, [x25, x26]\n" - "add x23, x23, #16\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "str q22, [x24, x27]\n" - "add x24, x24, #16\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "str q21, [x25, x27]\n" - "add x25, x25, #16\n" - "4:\n" - "cbz x13, 7f\n" - "ldr s14, [%[wbptr]]\n" - "mov v17.16b, v14.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v23.16b, v14.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v24.16b, v14.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "mov v20.16b, v14.16b\n" - "ldr s9, [%[wbptr], #16]\n" - "mov v16.16b, v14.16b\n" - "ldr s8, [%[wbptr], #20]\n" - "mov v13.16b, v14.16b\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v0.16b, v14.16b\n" - "ldr s6, [%[wbptr], #28]\n" - "mov v1.16b, v14.16b\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v2.16b, v14.16b\n" - "ldr s4, [%[wbptr], #36]\n" - "mov v3.16b, v14.16b\n" - "ldr s29, [%[inptr0]]\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "ldr s28, [x8]\n" - "ldr s30, [%[inptr0], %[input_col_stride1]]\n" - "subs x13, x13, #1\n" - "ldr s25, [x9]\n" - "ldr s26, [x8, %[input_col_stride1]]\n" - "ldr s27, [%[inptr0], x15]\n" - "ldr s15, [x10]\n" - "ldr s18, [x9, %[input_col_stride1]]\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x8, #64]\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [x8, x28]\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "prfm pldl1keep, [x10, #64]\n" - "prfm pldl1keep, [x9, x28]\n" - "beq 6f\n" - "5:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr s22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr s29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr s25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr s28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr s19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr s21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr s27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - 
"fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr s18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr s22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr s25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "subs x13, x13, #1\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "str s17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr s29, [x9, x17]\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, v9.4s\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "fmla v17.4s, v21.4s, v12.4s\n" - "ldr s26, [x8, x19]\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "str s23, [x23]\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr s28, [%[inptr0], x21]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr s30, [x12, %[input_col_stride1]]\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x28]\n" - "str s24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "ldr s27, [x11, x15]\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr s23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - "fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr s24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr s29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr s14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #4\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "prfm pldl1keep, [x8, #64]\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "prfm pldl1keep, [x8, x28]\n" - "str s20, [x24]\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, 
v6.4s\n" - "ldr s25, [x11, x17]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "ldr s30, [x10, x19]\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "str s16, [x23, %[output_col_stride1]]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "ldr s26, [x9, x21]\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "ldr s27, [x12, x17]\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr s20, [x11, x19]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "add x9, x9, #4\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "str s13, [%[outptr0], x26]\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "ldr s23, [x10, x21]\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr s24, [x12, x19]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "prfm pldl1keep, [x9, x28]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr s16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #4\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr s13, [x12, x21]\n" - "str s0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "ldr s14, [%[wbptr]]\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "add x11, x11, #4\n" - "str s1, [x24, %[output_col_stride1]]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "ldr s29, [%[inptr0]]\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "ldr s28, [x8]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "add x12, x12, #4\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "str s2, [x23, x26]\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr s9, [%[wbptr], #16]\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "ldr s30, [%[inptr0], %[input_col_stride1]]\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "ldr s25, [x9]\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "ldr s11, [%[wbptr], #8]\n" - "str s3, [%[outptr0], x27]\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v19.4s, v27.4s, v5.4s\n" - "ldr s26, [x8, %[input_col_stride1]]\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "ldr s27, [%[inptr0], x15]\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "ldr s15, [x10]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "ldr s6, [%[wbptr], #28]\n" - "str s18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "ldr s18, [x9, %[input_col_stride1]]\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "ldr s8, [%[wbptr], #20]\n" - "str s17, [x23, x27]\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "add x23, x23, #4\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "ldr s10, [%[wbptr], #12]\n" - "str s19, [x25, x26]\n" - "mov v17.16b, v14.16b\n" - "str s22, [x24, x27]\n" - "mov v23.16b, v14.16b\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v24.16b, v14.16b\n" - "add x24, x24, #4\n" - "mov v20.16b, v14.16b\n" - "mov v16.16b, v14.16b\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v13.16b, v14.16b\n" - "mov v0.16b, v14.16b\n" - "mov v1.16b, v14.16b\n" - "mov v2.16b, v14.16b\n" - "str s21, [x25, x27]\n" - "mov v3.16b, v14.16b\n" - "ldr s4, [%[wbptr], #36]\n" - "add x25, x25, #4\n" - "fmla v17.4s, v29.4s, v12.4s\n" - "bne 5b\n" - "6:\n" - "fmla v17.4s, v28.4s, v9.4s\n" - "prfm pldl1keep, [x8, x16]\n" - "fmla v23.4s, v28.4s, 
v12.4s\n" - "ldr s22, [x8, x15]\n" - "fmla v24.4s, v30.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "ldr s29, [%[inptr0], x17]\n" - "fmla v23.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x11, #64]\n" - "fmla v20.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x10, x28]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "ldr s25, [x11]\n" - "fmla v23.4s, v26.4s, v11.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [x8, x7]\n" - "fmla v17.4s, v26.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x20]\n" - "fmla v16.4s, v26.4s, v12.4s\n" - "ldr s28, [x10, %[input_col_stride1]]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "prfm pldl1keep, [x12, #64]\n" - "fmla v17.4s, v27.4s, v10.4s\n" - "prfm pldl1keep, [x11, x28]\n" - "fmla v13.4s, v27.4s, v12.4s\n" - "ldr s19, [x9, x15]\n" - "fmla v23.4s, v15.4s, v6.4s\n" - "prfm pldl1keep, [x10, x16]\n" - "fmla v20.4s, v15.4s, v9.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v0.4s, v15.4s, v12.4s\n" - "ldr s21, [x8, x17]\n" - "fmla v17.4s, v18.4s, v5.4s\n" - "prfm pldl1keep, [x8, x20]\n" - "fmla v23.4s, v18.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x22]\n" - "fmla v24.4s, v18.4s, v6.4s\n" - "prfm pldl1keep, [x12, x28]\n" - "fmla v20.4s, v18.4s, v11.4s\n" - "prfm pldl1keep, [x11, x16]\n" - "fmla v16.4s, v18.4s, v9.4s\n" - "prfm pldl1keep, [x10, x7]\n" - "fmla v1.4s, v18.4s, v12.4s\n" - "ldr s27, [%[inptr0], x19]\n" - "fmla v17.4s, v22.4s, v7.4s\n" - "prfm pldl1keep, [x9, x20]\n" - "fmla v23.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x8, x22]\n" - "fmla v24.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x12, x16]\n" - "fmla v16.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x11, x7]\n" - "fmla v13.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x20]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "ldr s18, [x12]\n" - "fmla v24.4s, v29.4s, v10.4s\n" - "prfm pldl1keep, [x9, x22]\n" - "fmla v13.4s, v29.4s, v11.4s\n" - "prfm pldl1keep, [x12, x7]\n" - "fmla v3.4s, v29.4s, v12.4s\n" - "ldr s22, [x11, %[input_col_stride1]]\n" - "fmla v20.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x11, x20]\n" - "fmla v0.4s, v25.4s, v9.4s\n" - "ldr s25, [x10, x15]\n" - "fmla v23.4s, v28.4s, v5.4s\n" - "prfm pldl1keep, [x10, x22]\n" - "fmla v20.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x12, x20]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "prfm pldl1keep, [x11, x22]\n" - "fmla v0.4s, v28.4s, v11.4s\n" - "prfm pldl1keep, [x12, x22]\n" - "fmla v1.4s, v28.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v17.4s, v19.4s, v4.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v23.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v10.4s\n" - "fmla v16.4s, v19.4s, v8.4s\n" - "str s17, [%[outptr0]]\n" - "mov v15.16b, v14.16b\n" - "fmla v13.4s, v19.4s, v6.4s\n" - "fmla v1.4s, v19.4s, v11.4s\n" - "fmla v15.4s, v28.4s, v12.4s\n" - "ldr s29, [x9, x17]\n" - "fmla v2.4s, v19.4s, v9.4s\n" - "fmla v24.4s, v21.4s, v7.4s\n" - "fmla v16.4s, v21.4s, v10.4s\n" - "fmla v13.4s, v21.4s, v8.4s\n" - "fmla v3.4s, v21.4s, v9.4s\n" - "fmla v0.4s, v18.4s, v6.4s\n" - "mov v18.16b, v14.16b\n" - "fmla v2.4s, v21.4s, v11.4s\n" - "fmla v13.4s, v27.4s, v10.4s\n" - "fmla v20.4s, v22.4s, v5.4s\n" - "fmla v18.4s, v19.4s, v12.4s\n" - "ldr s26, [x8, x19]\n" - "fmla v3.4s, v27.4s, v11.4s\n" - "ldr s28, [%[inptr0], x21]\n" - "fmla v0.4s, v22.4s, v8.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v22.4s, v6.4s\n" - "fmla v15.4s, v22.4s, v9.4s\n" - "mov v17.16b, v14.16b\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v20.4s, v25.4s, v7.4s\n" - "fmla v16.4s, v25.4s, v5.4s\n" 
- "fmla v17.4s, v21.4s, v12.4s\n" - "ldr s30, [x12, %[input_col_stride1]]\n" - "str s23, [x23]\n" - "mov v19.16b, v14.16b\n" - "fmla v0.4s, v25.4s, v10.4s\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "fmla v2.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "mov v22.16b, v14.16b\n" - "mov v21.16b, v14.16b\n" - "fmla v24.4s, v29.4s, v4.4s\n" - "fmla v16.4s, v29.4s, v7.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v1.4s, v29.4s, v10.4s\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v3.4s, v29.4s, v6.4s\n" - "str s24, [%[outptr0], %[output_col_stride1]]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "ldr s27, [x11, x15]\n" - "fmla v22.4s, v29.4s, v12.4s\n" - "ldr s23, [x10, x17]\n" - "fmla v13.4s, v26.4s, v7.4s\n" - "fmla v2.4s, v26.4s, v10.4s\n" - "fmla v3.4s, v26.4s, v8.4s\n" - "fmla v17.4s, v26.4s, v11.4s\n" - "fmla v0.4s, v30.4s, v5.4s\n" - "ldr s24, [x9, x19]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "ldr s29, [x8, x21]\n" - "fmla v3.4s, v28.4s, v10.4s\n" - "ldr s14, [x12, x15]\n" - "fmla v20.4s, v27.4s, v4.4s\n" - "add x8, x8, #4\n" - "fmla v0.4s, v27.4s, v7.4s\n" - "fmla v1.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v6.4s\n" - "str s20, [x24]\n" - "fmla v19.4s, v27.4s, v9.4s\n" - "fmla v16.4s, v23.4s, v4.4s\n" - "ldr s25, [x11, x17]\n" - "fmla v1.4s, v23.4s, v7.4s\n" - "ldr s30, [x10, x19]\n" - "fmla v2.4s, v23.4s, v5.4s\n" - "fmla v15.4s, v23.4s, v10.4s\n" - "str s16, [x23, %[output_col_stride1]]\n" - "fmla v18.4s, v23.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v6.4s\n" - "ldr s26, [x9, x21]\n" - "fmla v19.4s, v23.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v22.4s, v23.4s, v9.4s\n" - "fmla v21.4s, v23.4s, v12.4s\n" - "fmla v13.4s, v24.4s, v4.4s\n" - "ldr s27, [x12, x17]\n" - "fmla v2.4s, v24.4s, v7.4s\n" - "ldr s20, [x11, x19]\n" - "fmla v3.4s, v24.4s, v5.4s\n" - "fmla v18.4s, v24.4s, v10.4s\n" - "str s13, [%[outptr0], x26]\n" - "fmla v17.4s, v24.4s, v8.4s\n" - "fmla v22.4s, v24.4s, v11.4s\n" - "ldr s23, [x10, x21]\n" - "fmla v3.4s, v29.4s, v7.4s\n" - "ldr s24, [x12, x19]\n" - "fmla v17.4s, v29.4s, v10.4s\n" - "ldr s16, [x11, x21]\n" - "fmla v0.4s, v14.4s, v4.4s\n" - "add x10, x10, #4\n" - "fmla v15.4s, v14.4s, v5.4s\n" - "add x11, x11, #4\n" - "fmla v19.4s, v14.4s, v6.4s\n" - "ldr s13, [x12, x21]\n" - "str s0, [x25]\n" - "fmla v1.4s, v25.4s, v4.4s\n" - "fmla v15.4s, v25.4s, v7.4s\n" - "add x12, x12, #4\n" - "fmla v18.4s, v25.4s, v5.4s\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "str s1, [x24, %[output_col_stride1]]\n" - "fmla v22.4s, v25.4s, v6.4s\n" - "fmla v21.4s, v25.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "fmla v17.4s, v30.4s, v5.4s\n" - "fmla v19.4s, v30.4s, v10.4s\n" - "fmla v22.4s, v30.4s, v8.4s\n" - "str s2, [x23, x26]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "fmla v3.4s, v26.4s, v4.4s\n" - "fmla v17.4s, v26.4s, v7.4s\n" - "fmla v22.4s, v26.4s, v10.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v19.4s, v27.4s, v5.4s\n" - "fmla v21.4s, v27.4s, v6.4s\n" - "str s3, [%[outptr0], x27]\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmla v22.4s, v20.4s, v5.4s\n" - "fmla v19.4s, v20.4s, v7.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "str s18, [x24, x26]\n" - "fmla v21.4s, v20.4s, v8.4s\n" - "fmla v17.4s, v23.4s, v4.4s\n" - "fmla v22.4s, v23.4s, v7.4s\n" - "fmla v19.4s, v24.4s, v4.4s\n" - "fmla v21.4s, v23.4s, v10.4s\n" - "str s17, [x23, x27]\n" - "fmla v22.4s, v16.4s, v4.4s\n" - "str s19, [x25, x26]\n" - 
"add x23, x23, #4\n" - "fmla v21.4s, v24.4s, v5.4s\n" - "str s22, [x24, x27]\n" - "add x24, x24, #4\n" - "fmla v21.4s, v16.4s, v7.4s\n" - "fmla v21.4s, v13.4s, v4.4s\n" - "str s21, [x25, x27]\n" - "add x25, x25, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *inptrs[6][6], - float *outptrs[4][4] -) -{ - __asm __volatile( - "mov x27, xzr\n" - "mov x28, xzr\n" - "and x15, %[n_channels], #3\n" - "lsr x16, %[n_channels], #2\n" - "cbz x16, 4f\n" - "1:\n" - "ldr q13, [%[wbptr]]\n" - "ldr x17, [%[inptrs], 0]\n" - "mov v18.16b, v13.16b\n" - "ldr q12, [%[wbptr], #16]\n" - "mov v22.16b, v13.16b\n" - "ldr q11, [%[wbptr], #32]\n" - "mov v23.16b, v13.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "mov v19.16b, v13.16b\n" - "ldr q9, [%[wbptr], #64]\n" - "mov v17.16b, v13.16b\n" - "ldr q8, [%[wbptr], #80]\n" - "mov v14.16b, v13.16b\n" - "ldr q7, [%[wbptr], #96]\n" - "mov v0.16b, v13.16b\n" - "ldr q6, [%[wbptr], #112]\n" - "mov v1.16b, v13.16b\n" - "ldr q5, [%[wbptr], #128]\n" - "mov v2.16b, v13.16b\n" - "ldr q4, [%[wbptr], #144]\n" - "ldr q29, [x17, x27]\n" - "ldr x7, [%[inptrs], 48]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr x17, [%[inptrs], 8]\n" - "ldr q27, [x7, x27]\n" - "ldr x19, [%[inptrs], 96]\n" - "ldr q28, [x17, x27]\n" - "ldr x7, [%[inptrs], 56]\n" - "ldr q25, [x19, x27]\n" - "ldr x17, [%[inptrs], 16]\n" - "ldr q16, [x7, x27]\n" - "ldr x20, [%[inptrs], 144]\n" - "ldr q15, [x17, x27]\n" - "ldr x19, [%[inptrs], 104]\n" - "ldr q21, [x20, x27]\n" - "subs x16, x16, #1\n" - "ldr q29, [x19, x27]\n" - "beq 3f\n" - "2:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr q30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr q24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr q31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr q25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr q21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr q20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - 
"fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr q26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, [%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "subs x16, x16, #1\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr q27, [x22, x27]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr q30, [x21, x27]\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "ldr q24, [x20, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str q18, [x23, x28]\n" - "mov v16.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr q25, [x19, x27]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v19.4s, v30.4s, v5.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr q27, [x17, x27]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "ldr q28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x7, [%[inptrs], 88]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "ldr x17, [%[inptrs], 0]\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "str q22, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr q22, [x21, x27]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "str q23, [x23, x28]\n" - "mov v21.16b, v13.16b\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr q30, [x20, x27]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr q31, [x19, x27]\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr q26, [x7, x27]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr q23, [x22, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x7, [%[inptrs], 48]\n" - "fmla v1.4s, v22.4s, 
v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "str q19, [x25, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr q27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "ldr q28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr x25, [%[outptrs], 72]\n" - "str q17, [x24, x28]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "ldr q22, [x19, x27]\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x19, [%[inptrs], 96]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr q30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr q19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str q14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr q31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr q17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr q14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr q26, [x22, x27]\n" - "str q0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr q13, [%[wbptr]]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "add x27, x27, #16\n" - "str q1, [x25, x28]\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "ldr q12, [%[wbptr], #16]\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr q29, [x17, x27]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "ldr x17, [%[inptrs], 8]\n" - "str q2, [x24, x28]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "ldr q9, [%[wbptr], #64]\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "ldr q28, [x17, x27]\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr q25, [x19, x27]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "ldr x24, [%[outptrs], 56]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "ldr q11, [%[wbptr], #32]\n" - "str q3, [x23, x28]\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "ldr x7, [%[inptrs], 56]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "ldr x17, [%[inptrs], 16]\n" - "str q16, [x26, x28]\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "ldr q16, [x7, x27]\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "ldr q6, [%[wbptr], #112]\n" - "str q15, [x25, x28]\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "ldr q15, [x17, x27]\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "ldr x25, [%[outptrs], 88]\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "ldr q8, [%[wbptr], #80]\n" - "str q18, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "mov v22.16b, v13.16b\n" - "ldr x20, [%[inptrs], 144]\n" - "str q21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "mov v23.16b, v13.16b\n" - "ldr q10, [%[wbptr], #48]\n" - "str q24, [x25, x28]\n" - "mov v19.16b, v13.16b\n" - "mov v17.16b, v13.16b\n" - "ldr q21, [x20, x27]\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "ldr q5, [%[wbptr], #128]\n" - "mov v14.16b, v13.16b\n" - "ldr x26, 
[%[outptrs], 120]\n" - "mov v0.16b, v13.16b\n" - "ldr x19, [%[inptrs], 104]\n" - "mov v1.16b, v13.16b\n" - "mov v2.16b, v13.16b\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "ldr q7, [%[wbptr], #96]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr q29, [x19, x27]\n" - "str q20, [x26, x28]\n" - "ldr q4, [%[wbptr], #144]\n" - "add x28, x28, #16\n" - "bne 2b\n" - "3:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr q30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr q24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr q31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr q25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr q21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr q20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr q26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, [%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "mov v16.16b, v13.16b\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr q27, [x22, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr q30, [x21, x27]\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str q18, [x23, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr q24, [x20, x27]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr q25, [x19, x27]\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr q27, [x17, x27]\n" - "fmla v19.4s, v30.4s, v5.4s\n" - 
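
Since every pointer in those tables is indexed by one running channel offset, the tables only need to be built once per output tile. A hypothetical caller-side sketch (all names here are assumptions, not ACL API):

// Illustration only: assembling the 6x6 input-pointer grid for one 4x4
// output tile of a stride-1 3x3 pass. Strides are in elements; the kernel
// then advances through the channel dimension itself via its x27/x28
// byte offsets.
static void build_input_pointers(const float *input,
                                 long row_stride, long col_stride,
                                 int tile_row, int tile_col,
                                 const float *inptrs[6][6])
{
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            inptrs[i][j] = input + (tile_row + i) * row_stride
                                 + (tile_col + j) * col_stride;
}
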
"ldr x7, [%[inptrs], 88]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "ldr q28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "str q22, [x24, x28]\n" - "mov v21.16b, v13.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr q22, [x21, x27]\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "str q23, [x23, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr q30, [x20, x27]\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr q31, [x19, x27]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr q26, [x7, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr q23, [x22, x27]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "str q19, [x25, x28]\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "ldr q27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr q28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "ldr x25, [%[outptrs], 72]\n" - "str q17, [x24, x28]\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr q22, [x19, x27]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr q30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr q19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str q14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr q31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr q17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr q14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr q26, [x22, x27]\n" - "str q0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "add x27, x27, #16\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "str q1, [x25, x28]\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "str q2, [x24, x28]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr x24, 
[%[outptrs], 56]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "str q3, [x23, x28]\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "str q16, [x26, x28]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "str q15, [x25, x28]\n" - "str q18, [x24, x28]\n" - "ldr x25, [%[outptrs], 88]\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "str q21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "ldr x26, [%[outptrs], 120]\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "str q24, [x25, x28]\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "str q20, [x26, x28]\n" - "add x28, x28, #16\n" - "4:\n" - "cbz x15, 7f\n" - "ldr s13, [%[wbptr]]\n" - "mov v18.16b, v13.16b\n" - "ldr s12, [%[wbptr], #4]\n" - "mov v22.16b, v13.16b\n" - "ldr s11, [%[wbptr], #8]\n" - "mov v23.16b, v13.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "mov v19.16b, v13.16b\n" - "ldr s9, [%[wbptr], #16]\n" - "mov v17.16b, v13.16b\n" - "ldr s8, [%[wbptr], #20]\n" - "mov v14.16b, v13.16b\n" - "ldr s7, [%[wbptr], #24]\n" - "mov v0.16b, v13.16b\n" - "ldr s6, [%[wbptr], #28]\n" - "mov v1.16b, v13.16b\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v2.16b, v13.16b\n" - "ldr s4, [%[wbptr], #36]\n" - "ldr x17, [%[inptrs], 0]\n" - "ldr x7, [%[inptrs], 48]\n" - "ldr x19, [%[inptrs], 96]\n" - "ldr x20, [%[inptrs], 144]\n" - "subs x15, x15, #1\n" - "ldr s29, [x17, x27]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr s27, [x7, x27]\n" - "ldr s25, [x19, x27]\n" - "ldr x17, [%[inptrs], 8]\n" - "ldr s21, [x20, x27]\n" - "ldr x7, [%[inptrs], 56]\n" - "ldr s28, [x17, x27]\n" - "ldr x19, [%[inptrs], 104]\n" - "ldr s16, [x7, x27]\n" - "ldr x17, [%[inptrs], 16]\n" - "ldr s29, [x19, x27]\n" - "ldr s15, [x17, x27]\n" - "beq 6f\n" - "5:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr s30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr s31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr s25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr s21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr s20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, 
[%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "subs x15, x15, #1\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "ldr s27, [x22, x27]\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr s30, [x21, x27]\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "ldr s24, [x20, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str s18, [x23, x28]\n" - "mov v16.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr s25, [x19, x27]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v19.4s, v30.4s, v5.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr s27, [x17, x27]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "ldr s28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x7, [%[inptrs], 88]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "ldr x17, [%[inptrs], 0]\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "str s22, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr s22, [x21, x27]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "str s23, [x23, x28]\n" - "mov v21.16b, v13.16b\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr s30, [x20, x27]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr s31, [x19, x27]\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr s26, [x7, x27]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr s23, [x22, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x7, [%[inptrs], 48]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "str s19, [x25, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr s27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "ldr s28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr 
x25, [%[outptrs], 72]\n" - "str s17, [x24, x28]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "ldr s22, [x19, x27]\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x19, [%[inptrs], 96]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr s30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr s19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str s14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr s31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr s17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr s14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr s26, [x22, x27]\n" - "str s0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr s13, [%[wbptr]]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "add x27, x27, #4\n" - "str s1, [x25, x28]\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "ldr s12, [%[wbptr], #4]\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr s29, [x17, x27]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "ldr x17, [%[inptrs], 8]\n" - "str s2, [x24, x28]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "ldr s9, [%[wbptr], #16]\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "ldr s28, [x17, x27]\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr s25, [x19, x27]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "ldr x24, [%[outptrs], 56]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "ldr s11, [%[wbptr], #8]\n" - "str s3, [x23, x28]\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "ldr x7, [%[inptrs], 56]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "ldr x17, [%[inptrs], 16]\n" - "str s16, [x26, x28]\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "ldr s16, [x7, x27]\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "ldr s6, [%[wbptr], #28]\n" - "str s15, [x25, x28]\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "ldr s15, [x17, x27]\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "ldr x25, [%[outptrs], 88]\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "ldr s8, [%[wbptr], #20]\n" - "str s18, [x24, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "mov v22.16b, v13.16b\n" - "ldr x20, [%[inptrs], 144]\n" - "str s21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "mov v23.16b, v13.16b\n" - "ldr s10, [%[wbptr], #12]\n" - "str s24, [x25, x28]\n" - "mov v19.16b, v13.16b\n" - "mov v17.16b, v13.16b\n" - "ldr s21, [x20, x27]\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "ldr s5, [%[wbptr], #32]\n" - "mov v14.16b, v13.16b\n" - "ldr x26, [%[outptrs], 120]\n" - "mov v0.16b, v13.16b\n" - "ldr x19, [%[inptrs], 104]\n" - "mov v1.16b, v13.16b\n" - "mov v2.16b, v13.16b\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "ldr s7, [%[wbptr], #24]\n" - "fmla v18.4s, v29.4s, v12.4s\n" - "ldr s29, [x19, x27]\n" - "str s20, [x26, x28]\n" - "ldr s4, [%[wbptr], #36]\n" - "add x28, x28, #4\n" - "bne 5b\n" - "6:\n" - "mov v3.16b, v13.16b\n" - "ldr x7, [%[inptrs], 64]\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "ldr 
x17, [%[inptrs], 24]\n" - "fmla v22.4s, v27.4s, v12.4s\n" - "ldr s30, [x7, x27]\n" - "fmla v23.4s, v28.4s, v12.4s\n" - "ldr x21, [%[inptrs], 192]\n" - "fmla v19.4s, v25.4s, v12.4s\n" - "ldr x20, [%[inptrs], 152]\n" - "fmla v18.4s, v28.4s, v11.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v22.4s, v25.4s, v9.4s\n" - "ldr x19, [%[inptrs], 112]\n" - "fmla v23.4s, v16.4s, v9.4s\n" - "ldr x7, [%[inptrs], 72]\n" - "fmla v17.4s, v16.4s, v12.4s\n" - "ldr x17, [%[inptrs], 32]\n" - "fmla v18.4s, v25.4s, v6.4s\n" - "ldr s31, [x21, x27]\n" - "fmla v22.4s, v16.4s, v11.4s\n" - "ldr x22, [%[inptrs], 240]\n" - "fmla v23.4s, v15.4s, v11.4s\n" - "ldr x21, [%[inptrs], 200]\n" - "fmla v14.4s, v15.4s, v12.4s\n" - "ldr x23, [%[outptrs], 0]\n" - "fmla v18.4s, v16.4s, v8.4s\n" - "ldr s25, [x20, x27]\n" - "fmla v22.4s, v21.4s, v6.4s\n" - "ldr x20, [%[inptrs], 160]\n" - "fmla v19.4s, v21.4s, v9.4s\n" - "ldr x24, [%[outptrs], 32]\n" - "fmla v0.4s, v21.4s, v12.4s\n" - "ldr s21, [x19, x27]\n" - "fmla v18.4s, v15.4s, v10.4s\n" - "ldr s20, [x7, x27]\n" - "fmla v22.4s, v29.4s, v8.4s\n" - "ldr x19, [%[inptrs], 120]\n" - "fmla v23.4s, v29.4s, v6.4s\n" - "ldr x7, [%[inptrs], 80]\n" - "fmla v19.4s, v29.4s, v11.4s\n" - "ldr x25, [%[outptrs], 64]\n" - "fmla v18.4s, v29.4s, v5.4s\n" - "ldr x26, [%[outptrs], 96]\n" - "fmla v17.4s, v29.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v22.4s, v30.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v18.4s, v30.4s, v7.4s\n" - "ldr x17, [%[inptrs], 40]\n" - "fmla v23.4s, v30.4s, v8.4s\n" - "fmla v17.4s, v30.4s, v11.4s\n" - "fmla v14.4s, v30.4s, v9.4s\n" - "fmla v2.4s, v30.4s, v12.4s\n" - "mov v16.16b, v13.16b\n" - "fmla v3.4s, v24.4s, v12.4s\n" - "fmla v19.4s, v31.4s, v6.4s\n" - "fmla v0.4s, v31.4s, v9.4s\n" - "mov v15.16b, v13.16b\n" - "fmla v23.4s, v24.4s, v10.4s\n" - "fmla v14.4s, v24.4s, v11.4s\n" - "ldr s27, [x22, x27]\n" - "fmla v22.4s, v25.4s, v5.4s\n" - "ldr x22, [%[inptrs], 248]\n" - "fmla v19.4s, v25.4s, v8.4s\n" - "fmla v17.4s, v25.4s, v6.4s\n" - "fmla v0.4s, v25.4s, v11.4s\n" - "fmla v1.4s, v25.4s, v9.4s\n" - "fmla v16.4s, v25.4s, v12.4s\n" - "ldr s30, [x21, x27]\n" - "fmla v18.4s, v21.4s, v4.4s\n" - "ldr x21, [%[inptrs], 208]\n" - "fmla v22.4s, v21.4s, v7.4s\n" - "fmla v23.4s, v21.4s, v5.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v17.4s, v21.4s, v8.4s\n" - "fmla v14.4s, v21.4s, v6.4s\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "str s18, [x23, x28]\n" - "mov v18.16b, v13.16b\n" - "fmla v2.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 8]\n" - "fmla v15.4s, v21.4s, v12.4s\n" - "ldr s24, [x20, x27]\n" - "fmla v23.4s, v20.4s, v7.4s\n" - "ldr x20, [%[inptrs], 168]\n" - "fmla v17.4s, v20.4s, v10.4s\n" - "fmla v14.4s, v20.4s, v8.4s\n" - "fmla v2.4s, v20.4s, v11.4s\n" - "fmla v3.4s, v20.4s, v9.4s\n" - "fmla v18.4s, v20.4s, v12.4s\n" - "ldr s25, [x19, x27]\n" - "fmla v0.4s, v27.4s, v6.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v14.4s, v26.4s, v10.4s\n" - "ldr x19, [%[inptrs], 128]\n" - "fmla v3.4s, v26.4s, v11.4s\n" - "ldr s27, [x17, x27]\n" - "fmla v19.4s, v30.4s, v5.4s\n" - "ldr x7, [%[inptrs], 88]\n" - "fmla v0.4s, v30.4s, v8.4s\n" - "fmla v1.4s, v30.4s, v6.4s\n" - "fmla v16.4s, v30.4s, v9.4s\n" - "ldr s28, [x22, x27]\n" - "fmla v22.4s, v24.4s, v4.4s\n" - "ldr x22, [%[inptrs], 256]\n" - "fmla v19.4s, v24.4s, v7.4s\n" - "fmla v17.4s, v24.4s, v5.4s\n" - "fmla v0.4s, v24.4s, v10.4s\n" - "fmla v1.4s, v24.4s, v8.4s\n" - "fmla v2.4s, v24.4s, v6.4s\n" - "fmla v16.4s, v24.4s, v11.4s\n" - "str s22, [x24, x28]\n" - "mov 
v21.16b, v13.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr x24, [%[outptrs], 40]\n" - "fmla v23.4s, v25.4s, v4.4s\n" - "fmla v17.4s, v25.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v12.4s\n" - "ldr s22, [x21, x27]\n" - "fmla v14.4s, v25.4s, v5.4s\n" - "ldr x21, [%[inptrs], 216]\n" - "fmla v1.4s, v25.4s, v10.4s\n" - "fmla v2.4s, v25.4s, v8.4s\n" - "str s23, [x23, x28]\n" - "mov v24.16b, v13.16b\n" - "mov v20.16b, v13.16b\n" - "ldr x23, [%[outptrs], 16]\n" - "fmla v3.4s, v25.4s, v6.4s\n" - "fmla v15.4s, v25.4s, v11.4s\n" - "fmla v18.4s, v25.4s, v9.4s\n" - "fmla v24.4s, v25.4s, v12.4s\n" - "fmla v14.4s, v29.4s, v7.4s\n" - "ldr s30, [x20, x27]\n" - "fmla v2.4s, v29.4s, v10.4s\n" - "ldr x20, [%[inptrs], 176]\n" - "fmla v3.4s, v29.4s, v8.4s\n" - "fmla v0.4s, v28.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v11.4s\n" - "ldr s31, [x19, x27]\n" - "fmla v16.4s, v28.4s, v6.4s\n" - "ldr s26, [x7, x27]\n" - "fmla v19.4s, v22.4s, v4.4s\n" - "ldr x19, [%[inptrs], 136]\n" - "fmla v3.4s, v27.4s, v10.4s\n" - "ldr s23, [x22, x27]\n" - "fmla v0.4s, v22.4s, v7.4s\n" - "ldr x22, [%[inptrs], 264]\n" - "fmla v1.4s, v22.4s, v5.4s\n" - "fmla v16.4s, v22.4s, v8.4s\n" - "str s19, [x25, x28]\n" - "fmla v15.4s, v22.4s, v6.4s\n" - "fmla v21.4s, v22.4s, v9.4s\n" - "ldr s27, [x21, x27]\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "ldr s28, [x20, x27]\n" - "fmla v1.4s, v30.4s, v7.4s\n" - "ldr x21, [%[inptrs], 224]\n" - "fmla v2.4s, v30.4s, v5.4s\n" - "ldr x20, [%[inptrs], 184]\n" - "fmla v16.4s, v30.4s, v10.4s\n" - "ldr x25, [%[outptrs], 72]\n" - "str s17, [x24, x28]\n" - "fmla v15.4s, v30.4s, v8.4s\n" - "fmla v18.4s, v30.4s, v6.4s\n" - "ldr s22, [x19, x27]\n" - "fmla v21.4s, v30.4s, v11.4s\n" - "ldr x24, [%[outptrs], 48]\n" - "fmla v24.4s, v30.4s, v9.4s\n" - "fmla v20.4s, v30.4s, v12.4s\n" - "fmla v14.4s, v31.4s, v4.4s\n" - "ldr s30, [x22, x27]\n" - "fmla v2.4s, v31.4s, v7.4s\n" - "ldr s19, [x21, x27]\n" - "fmla v3.4s, v31.4s, v5.4s\n" - "ldr x22, [%[inptrs], 272]\n" - "fmla v15.4s, v31.4s, v10.4s\n" - "ldr x21, [%[inptrs], 232]\n" - "str s14, [x23, x28]\n" - "fmla v18.4s, v31.4s, v8.4s\n" - "fmla v24.4s, v31.4s, v11.4s\n" - "ldr s31, [x20, x27]\n" - "fmla v3.4s, v26.4s, v7.4s\n" - "ldr s17, [x22, x27]\n" - "fmla v0.4s, v23.4s, v4.4s\n" - "ldr x22, [%[inptrs], 280]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr s14, [x21, x27]\n" - "fmla v16.4s, v23.4s, v5.4s\n" - "ldr x23, [%[outptrs], 24]\n" - "fmla v21.4s, v23.4s, v6.4s\n" - "ldr s26, [x22, x27]\n" - "str s0, [x26, x28]\n" - "fmla v1.4s, v27.4s, v4.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "ldr x26, [%[outptrs], 104]\n" - "fmla v16.4s, v27.4s, v7.4s\n" - "add x27, x27, #4\n" - "fmla v21.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v6.4s\n" - "str s1, [x25, x28]\n" - "fmla v20.4s, v27.4s, v9.4s\n" - "fmla v2.4s, v28.4s, v4.4s\n" - "ldr x25, [%[outptrs], 80]\n" - "fmla v15.4s, v28.4s, v7.4s\n" - "fmla v18.4s, v28.4s, v5.4s\n" - "fmla v21.4s, v28.4s, v10.4s\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "fmla v20.4s, v28.4s, v11.4s\n" - "fmla v3.4s, v22.4s, v4.4s\n" - "str s2, [x24, x28]\n" - "fmla v16.4s, v30.4s, v4.4s\n" - "fmla v18.4s, v22.4s, v7.4s\n" - "ldr x24, [%[outptrs], 56]\n" - "fmla v24.4s, v22.4s, v10.4s\n" - "fmla v21.4s, v30.4s, v5.4s\n" - "str s3, [x23, x28]\n" - "fmla v20.4s, v30.4s, v6.4s\n" - "str s16, [x26, x28]\n" - "fmla v15.4s, v19.4s, v4.4s\n" - "fmla v18.4s, v31.4s, v4.4s\n" - "ldr x26, [%[outptrs], 112]\n" - "fmla v21.4s, v19.4s, v7.4s\n" - "fmla v24.4s, v19.4s, v5.4s\n" - "fmla v20.4s, v19.4s, v8.4s\n" - "str s15, [x25, x28]\n" - "str s18, [x24, x28]\n" - "ldr x25, [%[outptrs], 
88]\n" - "fmla v24.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v17.4s, v4.4s\n" - "fmla v20.4s, v31.4s, v10.4s\n" - "str s21, [x26, x28]\n" - "fmla v20.4s, v17.4s, v5.4s\n" - "ldr x26, [%[outptrs], 120]\n" - "fmla v24.4s, v14.4s, v4.4s\n" - "fmla v20.4s, v14.4s, v7.4s\n" - "str s24, [x25, x28]\n" - "fmla v20.4s, v26.4s, v4.4s\n" - "str s20, [x26, x28]\n" - "add x28, x28, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x9, %[inptr0], %[input_row_stride]\n" - "add x28, %[input_col_stride1], %[input_col_stride1]\n" - "add x16, %[outptr0], %[output_row_stride]\n" - "add x24, x9, %[input_row_stride]\n" - "add x25, x28, #64\n" - "add x23, x28, %[input_col_stride1]\n" - "add x26, x24, %[input_row_stride]\n" - "add x11, x23, #64\n" - "add x12, x23, %[input_col_stride1]\n" - "add x10, x26, %[input_row_stride]\n" - "add x13, x12, #64\n" - "add x14, x12, %[input_col_stride1]\n" - "add x27, x10, %[input_row_stride]\n" - "add x15, x14, #64\n" - "add x17, x16, %[output_row_stride]\n" - "add x7, x17, %[output_row_stride]\n" - "add x19, %[output_col_stride1], %[output_col_stride1]\n" - "and x21, %[n_channels], #3\n" - "add x20, x19, %[output_col_stride1]\n" - "lsr x22, %[n_channels], #2\n" - "cbz x22, 4f\n" - "1:\n" - "ldr q21, [%[wbptr]]\n" - "subs x22, x22, #1\n" - "mov v7.16b, v21.16b\n" - "ldr q20, [%[wbptr], #16]\n" - "mov v3.16b, v21.16b\n" - "ldr q14, [%[wbptr], #32]\n" - "mov v6.16b, v21.16b\n" - "ldr q13, [%[wbptr], #48]\n" - "mov v15.16b, v21.16b\n" - "ldr q17, [%[wbptr], #64]\n" - "mov v2.16b, v21.16b\n" - "ldr q12, [%[wbptr], #80]\n" - "mov v5.16b, v21.16b\n" - "ldr q11, [%[wbptr], #96]\n" - "mov v0.16b, v21.16b\n" - "ldr q10, [%[wbptr], #112]\n" - "mov v16.16b, v21.16b\n" - "ldr q9, [%[wbptr], #128]\n" - "mov v1.16b, v21.16b\n" - "ldr q8, [%[wbptr], #144]\n" - "mov v4.16b, v21.16b\n" - "ldr q22, [%[inptr0]]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr q19, [x9]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "ldr q18, [x24]\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "ldr q27, [x9, %[input_col_stride1]]\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "ldr q28, [%[inptr0], x28]\n" - "fmla v15.4s, v18.4s, v20.4s\n" - "ldr q25, [x26]\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "ldr q22, [x24, %[input_col_stride1]]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "prfm pldl1keep, [x9, x8]\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "prfm pldl1keep, [x26, #64]\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "beq 3f\n" - "2:\n" - "mov v18.16b, v21.16b\n" - "ldr q23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, 
x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr q24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr q26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr q25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr q22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr q23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla v4.4s, v24.4s, v20.4s\n" - "ldr q24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr q29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr q26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "subs x22, x22, #1\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr q27, [x26, x28]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v23.16b, v21.16b\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "ldr q29, [x24, x23]\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "ldr q28, [x9, x12]\n" - "fmla v2.4s, v29.4s, 
v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "ldr q21, [%[inptr0], x14]\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr q26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr q21, [x10, x28]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr q29, [x26, x23]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr q21, [x24, x12]\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr q28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr q27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "prfm pldl1keep, [x9, x8]\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "ldr q20, [x10, x23]\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr q26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr q21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #16\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr q28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr q20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr q17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr q26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #16\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr q27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "add x10, x10, #16\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "ldr q28, [x27, x14]\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr q21, [%[wbptr]]\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "add x27, x27, #16\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "ldr q20, [%[wbptr], #16]\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "ldr q14, [%[wbptr], #32]\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "ldr q17, [%[wbptr], #64]\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "ldr q13, [%[wbptr], #48]\n" - "str q7, [%[outptr0]]\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "str q6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "ldr q12, [%[wbptr], 
#80]\n" - "str q5, [%[outptr0], x19]\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "ldr q10, [%[wbptr], #112]\n" - "str q4, [%[outptr0], x20]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str q3, [x16]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr q11, [%[wbptr], #96]\n" - "str q2, [x16, %[output_col_stride1]]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "str q1, [x16, x19]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str q22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str q15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str q16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str q19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str q25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str q0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str q18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str q23, [x7, x19]\n" - "mov v7.16b, v21.16b\n" - "str q24, [x7, x20]\n" - "mov v3.16b, v21.16b\n" - "mov v6.16b, v21.16b\n" - "ldr q9, [%[wbptr], #128]\n" - "mov v15.16b, v21.16b\n" - "ldr q8, [%[wbptr], #144]\n" - "mov v2.16b, v21.16b\n" - "ldr q22, [%[inptr0]]\n" - "mov v5.16b, v21.16b\n" - "ldr q19, [x9]\n" - "mov v0.16b, v21.16b\n" - "ldr q23, [%[inptr0], %[input_col_stride1]]\n" - "mov v16.16b, v21.16b\n" - "ldr q18, [x24]\n" - "mov v1.16b, v21.16b\n" - "ldr q27, [x9, %[input_col_stride1]]\n" - "mov v4.16b, v21.16b\n" - "ldr q28, [%[inptr0], x28]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr q25, [x26]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr q22, [x24, %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "add x16, x16, #16\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "add x17, x17, #16\n" - "fmla v15.4s, v18.4s, v20.4s\n" - "add x7, x7, #16\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "bne 2b\n" - "3:\n" - "mov v18.16b, v21.16b\n" - "ldr q23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr q24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr q26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr q25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr q22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr q23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla 
v4.4s, v24.4s, v20.4s\n" - "ldr q24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr q29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr q26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "ldr q27, [x26, x28]\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr q29, [x24, x23]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr q28, [x9, x12]\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "mov v23.16b, v21.16b\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "fmla v2.4s, v29.4s, v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "ldr q21, [%[inptr0], x14]\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr q26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr q21, [x10, x28]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "ldr q29, [x26, x23]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr q21, [x24, x12]\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr q28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr q27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #16\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "ldr q20, [x10, 
x23]\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr q26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr q21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #16\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr q28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr q20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr q17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr q26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #16\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr q27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "add x10, x10, #16\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr q28, [x27, x14]\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "add x27, x27, #16\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "str q7, [%[outptr0]]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "str q6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "str q5, [%[outptr0], x19]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "str q4, [%[outptr0], x20]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "str q3, [x16]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str q2, [x16, %[output_col_stride1]]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str q1, [x16, x19]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "str q22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str q15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str q16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str q19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str q25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str q0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str q18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str q23, [x7, x19]\n" - "add x16, x16, #16\n" - "str q24, [x7, x20]\n" - "add x17, x17, #16\n" - "add x7, x7, #16\n" - "4:\n" - "cbz x21, 7f\n" - "ldr s21, [%[wbptr]]\n" - "mov v7.16b, v21.16b\n" - "ldr s20, [%[wbptr], #4]\n" - "mov v3.16b, v21.16b\n" - "ldr s14, [%[wbptr], #8]\n" - "mov v6.16b, v21.16b\n" - "ldr s13, [%[wbptr], #12]\n" - "mov v15.16b, v21.16b\n" - "ldr s17, [%[wbptr], #16]\n" - "mov v2.16b, v21.16b\n" - "ldr s12, [%[wbptr], #20]\n" - "mov v5.16b, v21.16b\n" - "ldr s11, [%[wbptr], #24]\n" - "mov v0.16b, v21.16b\n" - "ldr s10, [%[wbptr], #28]\n" - "mov v16.16b, v21.16b\n" - "ldr s9, [%[wbptr], #32]\n" - "mov v1.16b, v21.16b\n" - "ldr s8, [%[wbptr], #36]\n" - "mov v4.16b, v21.16b\n" - "ldr s22, [%[inptr0]]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr s19, [x9]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "ldr s18, [x24]\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "ldr s27, [x9, %[input_col_stride1]]\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "ldr s28, [%[inptr0], x28]\n" - "fmla v15.4s, 
v18.4s, v20.4s\n" - "ldr s25, [x26]\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "ldr s22, [x24, %[input_col_stride1]]\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x9, #64]\n" - "subs x21, x21, #1\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "prfm pldl1keep, [x9, x8]\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "prfm pldl1keep, [x26, #64]\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "beq 6f\n" - "5:\n" - "mov v18.16b, v21.16b\n" - "ldr s23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr s24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr s26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr s25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - "prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr s22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr s23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla v4.4s, v24.4s, v20.4s\n" - "ldr s24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr s29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr s26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "subs x21, x21, #1\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr s27, [x26, x28]\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v23.16b, 
v21.16b\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "ldr s29, [x24, x23]\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "ldr s28, [x9, x12]\n" - "fmla v2.4s, v29.4s, v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "ldr s21, [%[inptr0], x14]\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr s26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x8]\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr s21, [x10, x28]\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "ldr s29, [x26, x23]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "prfm pldl1keep, [%[inptr0], x25]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr s21, [x24, x12]\n" - "fmla v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr s28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr s27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "prfm pldl1keep, [x9, x8]\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "ldr s20, [x10, x23]\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr s26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr s21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #4\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "prfm pldl1keep, [x24, x8]\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr s28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr s20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr s17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr s26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #4\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr s27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "prfm pldl1keep, [x26, #64]\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "add x10, x10, #4\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "ldr s28, [x27, x14]\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr s21, [%[wbptr]]\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "add x27, x27, #4\n" - "fmla 
v25.4s, v20.4s, v9.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "ldr s20, [%[wbptr], #4]\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "ldr s14, [%[wbptr], #8]\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "ldr s17, [%[wbptr], #16]\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "ldr s13, [%[wbptr], #12]\n" - "str s7, [%[outptr0]]\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "str s6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "ldr s12, [%[wbptr], #20]\n" - "str s5, [%[outptr0], x19]\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "ldr s10, [%[wbptr], #28]\n" - "str s4, [%[outptr0], x20]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str s3, [x16]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr s11, [%[wbptr], #24]\n" - "str s2, [x16, %[output_col_stride1]]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "str s1, [x16, x19]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str s22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str s15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str s16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str s19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str s25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str s0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str s18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str s23, [x7, x19]\n" - "mov v7.16b, v21.16b\n" - "str s24, [x7, x20]\n" - "mov v3.16b, v21.16b\n" - "mov v6.16b, v21.16b\n" - "ldr s9, [%[wbptr], #32]\n" - "mov v15.16b, v21.16b\n" - "ldr s8, [%[wbptr], #36]\n" - "mov v2.16b, v21.16b\n" - "ldr s22, [%[inptr0]]\n" - "mov v5.16b, v21.16b\n" - "ldr s19, [x9]\n" - "mov v0.16b, v21.16b\n" - "ldr s23, [%[inptr0], %[input_col_stride1]]\n" - "mov v16.16b, v21.16b\n" - "ldr s18, [x24]\n" - "mov v1.16b, v21.16b\n" - "ldr s27, [x9, %[input_col_stride1]]\n" - "mov v4.16b, v21.16b\n" - "ldr s28, [%[inptr0], x28]\n" - "fmla v7.4s, v22.4s, v20.4s\n" - "ldr s25, [x26]\n" - "fmla v3.4s, v19.4s, v20.4s\n" - "ldr s22, [x24, %[input_col_stride1]]\n" - "fmla v6.4s, v23.4s, v20.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmla v7.4s, v19.4s, v17.4s\n" - "add x16, x16, #4\n" - "fmla v3.4s, v18.4s, v17.4s\n" - "add x17, x17, #4\n" - "fmla v15.4s, v18.4s, v20.4s\n" - "add x7, x7, #4\n" - "fmla v7.4s, v23.4s, v14.4s\n" - "fmla v3.4s, v27.4s, v14.4s\n" - "fmla v7.4s, v18.4s, v10.4s\n" - "fmla v7.4s, v27.4s, v12.4s\n" - "bne 5b\n" - "6:\n" - "mov v18.16b, v21.16b\n" - "ldr s23, [x9, x28]\n" - "mov v19.16b, v21.16b\n" - "prfm pldl1keep, [x9, x25]\n" - "fmla v6.4s, v27.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr0], x11]\n" - "fmla v2.4s, v27.4s, v20.4s\n" - "ldr s24, [%[inptr0], x23]\n" - "fmla v7.4s, v28.4s, v13.4s\n" - "prfm pldl1keep, [x10, #64]\n" - "fmla v6.4s, v28.4s, v14.4s\n" - "prfm pldl1keep, [x26, x8]\n" - "fmla v5.4s, v28.4s, v20.4s\n" - "ldr s26, [x10]\n" - "fmla v3.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x25]\n" - "fmla v15.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x9, x11]\n" - "fmla v0.4s, v25.4s, v20.4s\n" - "ldr s25, [x26, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [%[inptr0], x13]\n" - "fmla v3.4s, v22.4s, v12.4s\n" - "prfm pldl1keep, [x27, #64]\n" - "fmla v6.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [x10, x8]\n" - "fmla v15.4s, v22.4s, v14.4s\n" - "prfm pldl1keep, [x26, x25]\n" - "fmla v2.4s, v22.4s, v17.4s\n" - 
"prfm pldl1keep, [x24, x11]\n" - "fmla v16.4s, v22.4s, v20.4s\n" - "ldr s22, [x24, x28]\n" - "fmla v7.4s, v23.4s, v11.4s\n" - "prfm pldl1keep, [x9, x13]\n" - "fmla v3.4s, v23.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr0], x15]\n" - "fmla v6.4s, v23.4s, v12.4s\n" - "prfm pldl1keep, [x27, x8]\n" - "fmla v2.4s, v23.4s, v14.4s\n" - "prfm pldl1keep, [x10, x25]\n" - "fmla v5.4s, v23.4s, v17.4s\n" - "prfm pldl1keep, [x26, x11]\n" - "fmla v1.4s, v23.4s, v20.4s\n" - "ldr s23, [x9, x23]\n" - "fmla v6.4s, v24.4s, v13.4s\n" - "prfm pldl1keep, [x24, x13]\n" - "fmla v5.4s, v24.4s, v14.4s\n" - "prfm pldl1keep, [x9, x15]\n" - "fmla v4.4s, v24.4s, v20.4s\n" - "ldr s24, [%[inptr0], x12]\n" - "fmla v15.4s, v26.4s, v10.4s\n" - "prfm pldl1keep, [x27, x25]\n" - "fmla v0.4s, v26.4s, v17.4s\n" - "ldr s29, [x27]\n" - "fmla v3.4s, v25.4s, v9.4s\n" - "prfm pldl1keep, [x10, x11]\n" - "fmla v15.4s, v25.4s, v12.4s\n" - "prfm pldl1keep, [x26, x13]\n" - "fmla v2.4s, v25.4s, v10.4s\n" - "prfm pldl1keep, [x24, x15]\n" - "fmla v0.4s, v25.4s, v14.4s\n" - "prfm pldl1keep, [x27, x11]\n" - "fmla v16.4s, v25.4s, v17.4s\n" - "prfm pldl1keep, [x10, x13]\n" - "fmla v18.4s, v25.4s, v20.4s\n" - "ldr s26, [x10, %[input_col_stride1]]\n" - "fmla v7.4s, v22.4s, v8.4s\n" - "prfm pldl1keep, [x26, x15]\n" - "fmla v3.4s, v22.4s, v11.4s\n" - "prfm pldl1keep, [x27, x13]\n" - "fmla v6.4s, v22.4s, v9.4s\n" - "prfm pldl1keep, [x10, x15]\n" - "fmla v15.4s, v22.4s, v13.4s\n" - "prfm pldl1keep, [x27, x15]\n" - "fmla v2.4s, v22.4s, v12.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v5.4s, v22.4s, v10.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v22.4s, v14.4s\n" - "fmla v1.4s, v22.4s, v17.4s\n" - "fmla v19.4s, v22.4s, v20.4s\n" - "ldr s27, [x26, x28]\n" - "fmla v6.4s, v23.4s, v11.4s\n" - "fmla v2.4s, v23.4s, v13.4s\n" - "fmla v5.4s, v23.4s, v12.4s\n" - "fmla v1.4s, v23.4s, v14.4s\n" - "fmla v4.4s, v23.4s, v17.4s\n" - "fmla v0.4s, v29.4s, v10.4s\n" - "mov v22.16b, v21.16b\n" - "fmla v15.4s, v26.4s, v9.4s\n" - "fmla v5.4s, v24.4s, v13.4s\n" - "fmla v16.4s, v26.4s, v10.4s\n" - "fmla v22.4s, v23.4s, v20.4s\n" - "ldr s29, [x24, x23]\n" - "fmla v4.4s, v24.4s, v14.4s\n" - "ldr s28, [x9, x12]\n" - "fmla v0.4s, v26.4s, v12.4s\n" - "fmla v18.4s, v26.4s, v17.4s\n" - "mov v23.16b, v21.16b\n" - "fmla v3.4s, v27.4s, v8.4s\n" - "fmla v15.4s, v27.4s, v11.4s\n" - "fmla v2.4s, v27.4s, v9.4s\n" - "fmla v0.4s, v27.4s, v13.4s\n" - "fmla v16.4s, v27.4s, v12.4s\n" - "fmla v1.4s, v27.4s, v10.4s\n" - "fmla v18.4s, v27.4s, v14.4s\n" - "fmla v19.4s, v27.4s, v17.4s\n" - "fmla v23.4s, v27.4s, v20.4s\n" - "mov v25.16b, v21.16b\n" - "mov v24.16b, v21.16b\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "fmla v2.4s, v29.4s, v11.4s\n" - "fmla v5.4s, v29.4s, v9.4s\n" - "fmla v16.4s, v29.4s, v13.4s\n" - "fmla v1.4s, v29.4s, v12.4s\n" - "fmla v4.4s, v29.4s, v10.4s\n" - "fmla v19.4s, v29.4s, v14.4s\n" - "fmla v22.4s, v29.4s, v17.4s\n" - "fmla v25.4s, v29.4s, v20.4s\n" - "ldr s21, [%[inptr0], x14]\n" - "fmla v5.4s, v28.4s, v11.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v1.4s, v28.4s, v13.4s\n" - "fmla v4.4s, v28.4s, v12.4s\n" - "fmla v22.4s, v28.4s, v14.4s\n" - "ldr s26, [x27, %[input_col_stride1]]\n" - "fmla v0.4s, v26.4s, v9.4s\n" - "fmla v18.4s, v26.4s, v10.4s\n" - "fmla v4.4s, v21.4s, v13.4s\n" - "ldr s21, [x10, x28]\n" - "fmla v15.4s, v21.4s, v8.4s\n" - "ldr s29, [x26, x23]\n" - "fmla v0.4s, v21.4s, v11.4s\n" - "fmla v16.4s, v21.4s, v9.4s\n" - "fmla v18.4s, v21.4s, v12.4s\n" - "fmla v19.4s, v21.4s, v10.4s\n" - "fmla v23.4s, v21.4s, v17.4s\n" - "ldr s21, [x24, x12]\n" - "fmla 
v2.4s, v29.4s, v8.4s\n" - "fmla v16.4s, v29.4s, v11.4s\n" - "fmla v1.4s, v29.4s, v9.4s\n" - "fmla v18.4s, v29.4s, v13.4s\n" - "fmla v19.4s, v29.4s, v12.4s\n" - "fmla v22.4s, v29.4s, v10.4s\n" - "fmla v23.4s, v29.4s, v14.4s\n" - "fmla v25.4s, v29.4s, v17.4s\n" - "fmla v24.4s, v29.4s, v20.4s\n" - "ldr s28, [x9, x14]\n" - "fmla v5.4s, v21.4s, v8.4s\n" - "ldr s27, [x27, x28]\n" - "fmla v1.4s, v21.4s, v11.4s\n" - "add x9, x9, #4\n" - "fmla v4.4s, v21.4s, v9.4s\n" - "fmla v19.4s, v21.4s, v13.4s\n" - "fmla v22.4s, v21.4s, v12.4s\n" - "fmla v25.4s, v21.4s, v14.4s\n" - "fmla v0.4s, v27.4s, v8.4s\n" - "ldr s20, [x10, x23]\n" - "fmla v4.4s, v28.4s, v11.4s\n" - "fmla v18.4s, v27.4s, v9.4s\n" - "fmla v22.4s, v28.4s, v13.4s\n" - "ldr s26, [x26, x12]\n" - "fmla v23.4s, v27.4s, v10.4s\n" - "ldr s21, [x24, x14]\n" - "fmla v16.4s, v20.4s, v8.4s\n" - "add x24, x24, #4\n" - "fmla v18.4s, v20.4s, v11.4s\n" - "fmla v19.4s, v20.4s, v9.4s\n" - "fmla v23.4s, v20.4s, v12.4s\n" - "fmla v25.4s, v20.4s, v10.4s\n" - "fmla v24.4s, v20.4s, v17.4s\n" - "ldr s28, [x27, x23]\n" - "fmla v1.4s, v26.4s, v8.4s\n" - "ldr s20, [x10, x12]\n" - "fmla v19.4s, v26.4s, v11.4s\n" - "fmla v22.4s, v26.4s, v9.4s\n" - "fmla v23.4s, v26.4s, v13.4s\n" - "fmla v25.4s, v26.4s, v12.4s\n" - "fmla v24.4s, v26.4s, v14.4s\n" - "ldr s17, [x26, x14]\n" - "fmla v4.4s, v21.4s, v8.4s\n" - "ldr s26, [x27, x12]\n" - "fmla v22.4s, v21.4s, v11.4s\n" - "add x26, x26, #4\n" - "fmla v25.4s, v21.4s, v13.4s\n" - "ldr s27, [x10, x14]\n" - "fmla v18.4s, v28.4s, v8.4s\n" - "add x10, x10, #4\n" - "fmla v23.4s, v28.4s, v9.4s\n" - "fmla v24.4s, v28.4s, v10.4s\n" - "fmla v19.4s, v20.4s, v8.4s\n" - "ldr s28, [x27, x14]\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "add x27, x27, #4\n" - "fmla v23.4s, v20.4s, v11.4s\n" - "fmla v24.4s, v20.4s, v12.4s\n" - "fmla v22.4s, v17.4s, v8.4s\n" - "movi v29.16b, #0\n" - "fmla v25.4s, v17.4s, v11.4s\n" - "fmla v24.4s, v17.4s, v13.4s\n" - "fmla v23.4s, v26.4s, v8.4s\n" - "fmax v7.4s, v7.4s, v29.4s\n" - "fmla v25.4s, v27.4s, v8.4s\n" - "fmax v6.4s, v6.4s, v29.4s\n" - "str s7, [%[outptr0]]\n" - "fmla v24.4s, v26.4s, v9.4s\n" - "str s6, [%[outptr0], %[output_col_stride1]]\n" - "fmax v5.4s, v5.4s, v29.4s\n" - "fmax v4.4s, v4.4s, v29.4s\n" - "fmax v3.4s, v3.4s, v29.4s\n" - "str s5, [%[outptr0], x19]\n" - "fmla v24.4s, v27.4s, v11.4s\n" - "str s4, [%[outptr0], x20]\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "str s3, [x16]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str s2, [x16, %[output_col_stride1]]\n" - "fmla v24.4s, v28.4s, v8.4s\n" - "str s1, [x16, x19]\n" - "fmax v22.4s, v22.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "str s22, [x16, x20]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "str s15, [x17]\n" - "fmax v19.4s, v19.4s, v29.4s\n" - "str s16, [x17, %[output_col_stride1]]\n" - "fmax v25.4s, v25.4s, v29.4s\n" - "str s19, [x17, x19]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str s25, [x17, x20]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "str s0, [x7]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str s18, [x7, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "str s23, [x7, x19]\n" - "add x16, x16, #4\n" - "str s24, [x7, x20]\n" - "add x17, x17, #4\n" - "add x7, x7, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" 
(input_col_stride * sizeof(float))
- : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
- );
-}
-
-template <>
-template <>
-void Conv::execute_tile(
- int n_channels,
- const void *weight_bias_ptr,
- const float *inptrs[6][6],
- float *outptrs[4][4]
-)
-{
- __asm __volatile(
- "mov x27, xzr\n"
- "mov x28, xzr\n"
- "and x19, %[n_channels], #3\n"
- "lsr x26, %[n_channels], #2\n"
- "cbz x26, 4f\n"
- "1:\n"
- "ldr q25, [%[wbptr]]\n"
- "ldr x25, [%[inptrs], 0]\n"
- "mov v2.16b, v25.16b\n"
- "ldr q22, [%[wbptr], #16]\n"
- "mov v16.16b, v25.16b\n"
- "ldr q9, [%[wbptr], #32]\n"
- "mov v18.16b, v25.16b\n"
- "ldr q8, [%[wbptr], #48]\n"
- "mov v13.16b, v25.16b\n"
- "ldr q19, [%[wbptr], #64]\n"
- "mov v0.16b, v25.16b\n"
- "ldr q7, [%[wbptr], #80]\n"
- "mov v17.16b, v25.16b\n"
- "ldr q6, [%[wbptr], #96]\n"
- "mov v14.16b, v25.16b\n"
- "ldr q5, [%[wbptr], #112]\n"
- "mov v12.16b, v25.16b\n"
- "ldr q4, [%[wbptr], #128]\n"
- "mov v15.16b, v25.16b\n"
- "ldr q3, [%[wbptr], #144]\n"
- "ldr q27, [x25, x27]\n"
- "ldr x17, [%[inptrs], 48]\n"
- "fmla v2.4s, v27.4s, v22.4s\n"
- "ldr x25, [%[inptrs], 8]\n"
- "ldr q26, [x17, x27]\n"
- "ldr x24, [%[inptrs], 96]\n"
- "fmla v16.4s, v26.4s, v22.4s\n"
- "ldr q31, [x25, x27]\n"
- "ldr q28, [x24, x27]\n"
- "ldr x17, [%[inptrs], 56]\n"
- "fmla v2.4s, v26.4s, v19.4s\n"
- "ldr x25, [%[inptrs], 16]\n"
- "ldr q29, [x17, x27]\n"
- "ldr x7, [%[inptrs], 144]\n"
- "ldr x24, [%[inptrs], 104]\n"
- "subs x26, x26, #1\n"
- "ldr q30, [x25, x27]\n"
- "ldr q27, [x7, x27]\n"
- "ldr q21, [x24, x27]\n"
- "fmla v2.4s, v31.4s, v9.4s\n"
- "beq 3f\n"
- "2:\n"
- "mov v1.16b, v25.16b\n"
- "ldr x17, [%[inptrs], 64]\n"
- "mov v10.16b, v25.16b\n"
- "ldr x25, [%[inptrs], 24]\n"
- "fmla v18.4s, v31.4s, v22.4s\n"
- "ldr q23, [x17, x27]\n"
- "fmla v2.4s, v28.4s, v5.4s\n"
- "ldr x15, [%[inptrs], 192]\n"
- "fmla v16.4s, v28.4s, v19.4s\n"
- "ldr x7, [%[inptrs], 152]\n"
- "fmla v13.4s, v28.4s, v22.4s\n"
- "ldr q26, [x25, x27]\n"
- "fmla v18.4s, v29.4s, v19.4s\n"
- "ldr x24, [%[inptrs], 112]\n"
- "fmla v2.4s, v29.4s, v7.4s\n"
- "ldr x17, [%[inptrs], 72]\n"
- "fmla v16.4s, v29.4s, v9.4s\n"
- "ldr x25, [%[inptrs], 32]\n"
- "fmla v0.4s, v29.4s, v22.4s\n"
- "ldr q28, [x15, x27]\n"
- "fmla v18.4s, v30.4s, v9.4s\n"
- "ldr x16, [%[inptrs], 240]\n"
- "fmla v2.4s, v30.4s, v8.4s\n"
- "ldr x15, [%[inptrs], 200]\n"
- "fmla v17.4s, v30.4s, v22.4s\n"
- "ldr q29, [x7, x27]\n"
- "fmla v16.4s, v27.4s, v5.4s\n"
- "ldr x7, [%[inptrs], 160]\n"
- "fmla v13.4s, v27.4s, v19.4s\n"
- "ldr x20, [%[outptrs], 0]\n"
- "fmla v14.4s, v27.4s, v22.4s\n"
- "ldr q20, [x24, x27]\n"
- "fmla v2.4s, v21.4s, v4.4s\n"
- "ldr x24, [%[inptrs], 120]\n"
- "fmla v16.4s, v21.4s, v7.4s\n"
- "ldr x21, [%[outptrs], 32]\n"
- "fmla v18.4s, v21.4s, v5.4s\n"
- "ldr x22, [%[outptrs], 64]\n"
- "fmla v13.4s, v21.4s, v9.4s\n"
- "ldr x23, [%[outptrs], 96]\n"
- "fmla v0.4s, v21.4s, v19.4s\n"
- "add %[wbptr], %[wbptr], #160\n"
- "fmla v12.4s, v21.4s, v22.4s\n"
- "ldr q24, [x17, x27]\n"
- "fmla v2.4s, v23.4s, v6.4s\n"
- "prfm pldl1keep, [%[wbptr], #64]\n"
- "fmla v16.4s, v23.4s, v8.4s\n"
- "ldr x17, [%[inptrs], 80]\n"
- "fmla v18.4s, v23.4s, v7.4s\n"
- "subs x26, x26, #1\n"
- "fmla v0.4s, v23.4s, v9.4s\n"
- "fmla v17.4s, v23.4s, v19.4s\n"
- "fmla v15.4s, v23.4s, 
v22.4s\n" - "ldr q23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v11.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr q30, [x24, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "fmla v12.4s, v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "ldr q31, [x17, x27]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr q25, [x25, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 0]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr q26, [x16, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr q28, [x24, x27]\n" - "fmla v15.4s, v29.4s, v4.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "ldr q25, [x17, x27]\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr q29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x17, [%[inptrs], 48]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "ldr q22, [x15, x27]\n" - "fmla v14.4s, v29.4s, 
v3.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "ldr q26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "ldr x24, [%[inptrs], 96]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr q25, [x16, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr q22, [x7, x27]\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr q19, [x16, x27]\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "ldr q28, [x15, x27]\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr x7, [%[inptrs], 144]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "add x27, x27, #16\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "ldr q27, [x25, x27]\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 8]\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "movi v29.16b, #0\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "ldr q26, [x17, x27]\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "ldr x17, [%[inptrs], 56]\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmla v23.4s, v22.4s, v8.4s\n" - "ldr q25, [%[wbptr]]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "ldr q22, [%[wbptr], #16]\n" - "str q2, [x20, x28]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "ldr q9, [%[wbptr], #32]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "ldr q8, [%[wbptr], #48]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr q19, [%[wbptr], #64]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str q18, [x20, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str q16, [x21, x28]\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr q7, [%[wbptr], #80]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "ldr q5, [%[wbptr], #112]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "ldr q6, [%[wbptr], #96]\n" - "str q13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr q4, [%[wbptr], #128]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr q31, [x25, x27]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "ldr q3, [%[wbptr], #144]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr q28, [x24, x27]\n" - "str q14, [x23, x28]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "mov v2.16b, v25.16b\n" - "ldr q29, [x17, x27]\n" - "ldr x20, [%[outptrs], 16]\n" - "ldr x21, [%[outptrs], 40]\n" - "ldr x22, [%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "ldr x25, [%[inptrs], 16]\n" - "ldr x24, [%[inptrs], 104]\n" - "str q17, [x20, x28]\n" - "mov v16.16b, v25.16b\n" - "str q0, [x21, x28]\n" - "mov v18.16b, v25.16b\n" - "str q12, [x22, x28]\n" - "mov v13.16b, v25.16b\n" - "str q10, [x23, x28]\n" - "mov v0.16b, v25.16b\n" - "fmla v2.4s, v27.4s, v22.4s\n" - "ldr q30, [x25, x27]\n" - "fmla v16.4s, v26.4s, v22.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "mov v17.16b, v25.16b\n" - "ldr x21, [%[outptrs], 48]\n" - "str q1, [x20, x28]\n" - "mov v14.16b, v25.16b\n" - "str q15, [x21, x28]\n" - "mov v12.16b, v25.16b\n" - "mov v15.16b, v25.16b\n" - "ldr x21, 
[%[outptrs], 56]\n" - "fmla v2.4s, v26.4s, v19.4s\n" - "ldr q27, [x7, x27]\n" - "str q21, [x21, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr q21, [x24, x27]\n" - "ldr x23, [%[outptrs], 112]\n" - "str q11, [x22, x28]\n" - "fmla v2.4s, v31.4s, v9.4s\n" - "str q20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str q24, [x22, x28]\n" - "str q23, [x23, x28]\n" - "add x28, x28, #16\n" - "bne 2b\n" - "3:\n" - "mov v1.16b, v25.16b\n" - "ldr x17, [%[inptrs], 64]\n" - "mov v10.16b, v25.16b\n" - "ldr x25, [%[inptrs], 24]\n" - "mov v11.16b, v25.16b\n" - "ldr x15, [%[inptrs], 192]\n" - "fmla v18.4s, v31.4s, v22.4s\n" - "ldr q23, [x17, x27]\n" - "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x7, [%[inptrs], 152]\n" - "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x24, [%[inptrs], 112]\n" - "fmla v13.4s, v28.4s, v22.4s\n" - "ldr q26, [x25, x27]\n" - "fmla v18.4s, v29.4s, v19.4s\n" - "ldr x17, [%[inptrs], 72]\n" - "fmla v2.4s, v29.4s, v7.4s\n" - "ldr x25, [%[inptrs], 32]\n" - "fmla v16.4s, v29.4s, v9.4s\n" - "ldr x16, [%[inptrs], 240]\n" - "fmla v0.4s, v29.4s, v22.4s\n" - "ldr q28, [x15, x27]\n" - "fmla v18.4s, v30.4s, v9.4s\n" - "ldr x15, [%[inptrs], 200]\n" - "fmla v2.4s, v30.4s, v8.4s\n" - "ldr x20, [%[outptrs], 0]\n" - "fmla v17.4s, v30.4s, v22.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x7, [%[inptrs], 160]\n" - "fmla v13.4s, v27.4s, v19.4s\n" - "ldr x21, [%[outptrs], 32]\n" - "fmla v14.4s, v27.4s, v22.4s\n" - "ldr q20, [x24, x27]\n" - "fmla v2.4s, v21.4s, v4.4s\n" - "ldr x24, [%[inptrs], 120]\n" - "fmla v16.4s, v21.4s, v7.4s\n" - "ldr x22, [%[outptrs], 64]\n" - "fmla v18.4s, v21.4s, v5.4s\n" - "ldr x23, [%[outptrs], 96]\n" - "fmla v13.4s, v21.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v0.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v12.4s, v21.4s, v22.4s\n" - "ldr q24, [x17, x27]\n" - "fmla v2.4s, v23.4s, v6.4s\n" - "ldr x17, [%[inptrs], 80]\n" - "fmla v16.4s, v23.4s, v8.4s\n" - "fmla v18.4s, v23.4s, v7.4s\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "fmla v17.4s, v23.4s, v19.4s\n" - "fmla v15.4s, v23.4s, v22.4s\n" - "ldr q23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr q26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "ldr q30, [x24, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr q31, [x17, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v12.4s, 
v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "ldr q25, [x25, x27]\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr q26, [x16, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr q29, [x7, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr q28, [x24, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr q25, [x17, x27]\n" - "fmla v15.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr q29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr q22, [x15, x27]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "fmla v14.4s, v29.4s, v3.4s\n" - "ldr q27, [x7, x27]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr q26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr q25, [x16, x27]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr q31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr q22, [x7, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "ldr q19, [x16, x27]\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr q28, [x15, x27]\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr q30, [x16, x27]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "add x27, x27, #16\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "movi v29.16b, #0\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str q2, [x20, x28]\n" - "fmla 
v23.4s, v22.4s, v8.4s\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str q18, [x20, x28]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str q16, [x21, x28]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "ldr x20, [%[outptrs], 16]\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr x21, [%[outptrs], 40]\n" - "str q17, [x20, x28]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "str q0, [x21, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str q13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr x21, [%[outptrs], 48]\n" - "str q1, [x20, x28]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "str q15, [x21, x28]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "str q14, [x23, x28]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr x21, [%[outptrs], 56]\n" - "ldr x22, [%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str q21, [x21, x28]\n" - "str q12, [x22, x28]\n" - "str q10, [x23, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr x23, [%[outptrs], 112]\n" - "str q11, [x22, x28]\n" - "str q20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str q24, [x22, x28]\n" - "str q23, [x23, x28]\n" - "add x28, x28, #16\n" - "4:\n" - "cbz x19, 7f\n" - "ldr s25, [%[wbptr]]\n" - "mov v2.16b, v25.16b\n" - "ldr s22, [%[wbptr], #4]\n" - "mov v16.16b, v25.16b\n" - "ldr s9, [%[wbptr], #8]\n" - "mov v18.16b, v25.16b\n" - "ldr s8, [%[wbptr], #12]\n" - "mov v13.16b, v25.16b\n" - "ldr s19, [%[wbptr], #16]\n" - "mov v0.16b, v25.16b\n" - "ldr s7, [%[wbptr], #20]\n" - "mov v17.16b, v25.16b\n" - "ldr s6, [%[wbptr], #24]\n" - "mov v14.16b, v25.16b\n" - "ldr s5, [%[wbptr], #28]\n" - "mov v12.16b, v25.16b\n" - "ldr s4, [%[wbptr], #32]\n" - "mov v15.16b, v25.16b\n" - "ldr s3, [%[wbptr], #36]\n" - "ldr x25, [%[inptrs], 0]\n" - "ldr x17, [%[inptrs], 48]\n" - "ldr x24, [%[inptrs], 96]\n" - "ldr x7, [%[inptrs], 144]\n" - "subs x19, x19, #1\n" - "ldr s27, [x25, x27]\n" - "fmla v2.4s, v27.4s, v22.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v16.4s, v26.4s, v22.4s\n" - "ldr s28, [x24, x27]\n" - "ldr s27, [x7, x27]\n" - "ldr x25, [%[inptrs], 8]\n" - "ldr x17, [%[inptrs], 56]\n" - "ldr x24, [%[inptrs], 104]\n" - "ldr s31, [x25, x27]\n" - "fmla v2.4s, v26.4s, v19.4s\n" - "ldr s29, [x17, x27]\n" - "ldr s21, [x24, x27]\n" - "ldr x25, [%[inptrs], 16]\n" - "ldr s30, [x25, x27]\n" - "fmla v2.4s, v31.4s, v9.4s\n" - "beq 6f\n" - "5:\n" - "mov v1.16b, v25.16b\n" - "ldr x17, [%[inptrs], 64]\n" - "mov v10.16b, v25.16b\n" - "ldr x25, [%[inptrs], 24]\n" - "fmla v18.4s, v31.4s, v22.4s\n" - "ldr s23, [x17, x27]\n" - "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x15, [%[inptrs], 192]\n" - "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x7, [%[inptrs], 152]\n" - "fmla v13.4s, v28.4s, v22.4s\n" - "ldr s26, [x25, x27]\n" - "fmla v18.4s, v29.4s, v19.4s\n" - "ldr x24, [%[inptrs], 112]\n" - "fmla v2.4s, v29.4s, v7.4s\n" - "ldr x17, [%[inptrs], 72]\n" - "fmla v16.4s, v29.4s, v9.4s\n" - "ldr x25, [%[inptrs], 32]\n" - "fmla v0.4s, v29.4s, v22.4s\n" - "ldr s28, [x15, x27]\n" - "fmla v18.4s, v30.4s, v9.4s\n" - "ldr x16, [%[inptrs], 240]\n" - "fmla v2.4s, v30.4s, v8.4s\n" - "ldr x15, [%[inptrs], 200]\n" - "fmla v17.4s, v30.4s, v22.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x7, [%[inptrs], 160]\n" - "fmla v13.4s, v27.4s, v19.4s\n" - "ldr x20, [%[outptrs], 0]\n" - "fmla v14.4s, v27.4s, v22.4s\n" - "ldr s20, [x24, x27]\n" - 
"fmla v2.4s, v21.4s, v4.4s\n" - "ldr x24, [%[inptrs], 120]\n" - "fmla v16.4s, v21.4s, v7.4s\n" - "ldr x21, [%[outptrs], 32]\n" - "fmla v18.4s, v21.4s, v5.4s\n" - "ldr x22, [%[outptrs], 64]\n" - "fmla v13.4s, v21.4s, v9.4s\n" - "ldr x23, [%[outptrs], 96]\n" - "fmla v0.4s, v21.4s, v19.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v12.4s, v21.4s, v22.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v2.4s, v23.4s, v6.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v16.4s, v23.4s, v8.4s\n" - "ldr x17, [%[inptrs], 80]\n" - "fmla v18.4s, v23.4s, v7.4s\n" - "subs x19, x19, #1\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "fmla v17.4s, v23.4s, v19.4s\n" - "fmla v15.4s, v23.4s, v22.4s\n" - "ldr s23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v11.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr s30, [x24, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "fmla v12.4s, v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "ldr s31, [x17, x27]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr s25, [x25, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 0]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr s26, [x16, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr s28, [x24, x27]\n" 
- "fmla v15.4s, v29.4s, v4.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "ldr s25, [x17, x27]\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr s29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x17, [%[inptrs], 48]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "ldr s22, [x15, x27]\n" - "fmla v14.4s, v29.4s, v3.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "ldr s26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "ldr x24, [%[inptrs], 96]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr s25, [x16, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr s22, [x7, x27]\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr s19, [x16, x27]\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "ldr s28, [x15, x27]\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr x7, [%[inptrs], 144]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "add x27, x27, #4\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "ldr s27, [x25, x27]\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "ldr x25, [%[inptrs], 8]\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "movi v29.16b, #0\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "ldr s26, [x17, x27]\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "ldr x17, [%[inptrs], 56]\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmla v23.4s, v22.4s, v8.4s\n" - "ldr s25, [%[wbptr]]\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "ldr s22, [%[wbptr], #4]\n" - "str s2, [x20, x28]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "ldr s9, [%[wbptr], #8]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "ldr s8, [%[wbptr], #12]\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "ldr s19, [%[wbptr], #16]\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str s18, [x20, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str s16, [x21, x28]\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr s7, [%[wbptr], #20]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "ldr s5, [%[wbptr], #28]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "ldr s6, [%[wbptr], #24]\n" - "str s13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr s4, [%[wbptr], #32]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr s31, [x25, x27]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "ldr s3, [%[wbptr], #36]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr s28, [x24, x27]\n" - "str s14, [x23, x28]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "mov v2.16b, v25.16b\n" - "ldr s29, [x17, x27]\n" - "ldr x20, [%[outptrs], 16]\n" - "ldr x21, [%[outptrs], 40]\n" - "ldr x22, 
[%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "ldr x25, [%[inptrs], 16]\n" - "ldr x24, [%[inptrs], 104]\n" - "str s17, [x20, x28]\n" - "mov v16.16b, v25.16b\n" - "str s0, [x21, x28]\n" - "mov v18.16b, v25.16b\n" - "str s12, [x22, x28]\n" - "mov v13.16b, v25.16b\n" - "str s10, [x23, x28]\n" - "mov v0.16b, v25.16b\n" - "fmla v2.4s, v27.4s, v22.4s\n" - "ldr s30, [x25, x27]\n" - "fmla v16.4s, v26.4s, v22.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "mov v17.16b, v25.16b\n" - "ldr x21, [%[outptrs], 48]\n" - "str s1, [x20, x28]\n" - "mov v14.16b, v25.16b\n" - "str s15, [x21, x28]\n" - "mov v12.16b, v25.16b\n" - "mov v15.16b, v25.16b\n" - "ldr x21, [%[outptrs], 56]\n" - "fmla v2.4s, v26.4s, v19.4s\n" - "ldr s27, [x7, x27]\n" - "str s21, [x21, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr s21, [x24, x27]\n" - "ldr x23, [%[outptrs], 112]\n" - "str s11, [x22, x28]\n" - "fmla v2.4s, v31.4s, v9.4s\n" - "str s20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str s24, [x22, x28]\n" - "str s23, [x23, x28]\n" - "add x28, x28, #4\n" - "bne 5b\n" - "6:\n" - "mov v1.16b, v25.16b\n" - "ldr x17, [%[inptrs], 64]\n" - "mov v10.16b, v25.16b\n" - "ldr x25, [%[inptrs], 24]\n" - "mov v11.16b, v25.16b\n" - "ldr x15, [%[inptrs], 192]\n" - "fmla v18.4s, v31.4s, v22.4s\n" - "ldr s23, [x17, x27]\n" - "fmla v2.4s, v28.4s, v5.4s\n" - "ldr x7, [%[inptrs], 152]\n" - "fmla v16.4s, v28.4s, v19.4s\n" - "ldr x24, [%[inptrs], 112]\n" - "fmla v13.4s, v28.4s, v22.4s\n" - "ldr s26, [x25, x27]\n" - "fmla v18.4s, v29.4s, v19.4s\n" - "ldr x17, [%[inptrs], 72]\n" - "fmla v2.4s, v29.4s, v7.4s\n" - "ldr x25, [%[inptrs], 32]\n" - "fmla v16.4s, v29.4s, v9.4s\n" - "ldr x16, [%[inptrs], 240]\n" - "fmla v0.4s, v29.4s, v22.4s\n" - "ldr s28, [x15, x27]\n" - "fmla v18.4s, v30.4s, v9.4s\n" - "ldr x15, [%[inptrs], 200]\n" - "fmla v2.4s, v30.4s, v8.4s\n" - "ldr x20, [%[outptrs], 0]\n" - "fmla v17.4s, v30.4s, v22.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v16.4s, v27.4s, v5.4s\n" - "ldr x7, [%[inptrs], 160]\n" - "fmla v13.4s, v27.4s, v19.4s\n" - "ldr x21, [%[outptrs], 32]\n" - "fmla v14.4s, v27.4s, v22.4s\n" - "ldr s20, [x24, x27]\n" - "fmla v2.4s, v21.4s, v4.4s\n" - "ldr x24, [%[inptrs], 120]\n" - "fmla v16.4s, v21.4s, v7.4s\n" - "ldr x22, [%[outptrs], 64]\n" - "fmla v18.4s, v21.4s, v5.4s\n" - "ldr x23, [%[outptrs], 96]\n" - "fmla v13.4s, v21.4s, v9.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v0.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v12.4s, v21.4s, v22.4s\n" - "ldr s24, [x17, x27]\n" - "fmla v2.4s, v23.4s, v6.4s\n" - "ldr x17, [%[inptrs], 80]\n" - "fmla v16.4s, v23.4s, v8.4s\n" - "fmla v18.4s, v23.4s, v7.4s\n" - "fmla v0.4s, v23.4s, v9.4s\n" - "fmla v17.4s, v23.4s, v19.4s\n" - "fmla v15.4s, v23.4s, v22.4s\n" - "ldr s23, [x25, x27]\n" - "fmla v1.4s, v26.4s, v22.4s\n" - "ldr x25, [%[inptrs], 40]\n" - "fmla v18.4s, v26.4s, v8.4s\n" - "fmla v13.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v26.4s, v9.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v14.4s, v28.4s, v19.4s\n" - "ldr s26, [x15, x27]\n" - "fmla v16.4s, v29.4s, v4.4s\n" - "ldr x16, [%[inptrs], 248]\n" - "fmla v13.4s, v29.4s, v7.4s\n" - "ldr x15, [%[inptrs], 208]\n" - "fmla v0.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v19.4s\n" - "fmla v14.4s, v29.4s, v9.4s\n" - "fmla v10.4s, v29.4s, v22.4s\n" - "mov v21.16b, v25.16b\n" - "fmla v2.4s, v20.4s, v3.4s\n" - "fmla v16.4s, v20.4s, v6.4s\n" - "fmla v18.4s, v20.4s, v4.4s\n" - "fmla v13.4s, v20.4s, v8.4s\n" - "fmla v0.4s, v20.4s, v7.4s\n" - "fmla v17.4s, v20.4s, v5.4s\n" - "fmla v12.4s, v20.4s, 
v9.4s\n" - "fmla v15.4s, v20.4s, v19.4s\n" - "fmla v11.4s, v20.4s, v22.4s\n" - "mov v20.16b, v25.16b\n" - "fmla v18.4s, v24.4s, v6.4s\n" - "fmla v0.4s, v24.4s, v8.4s\n" - "fmla v1.4s, v24.4s, v19.4s\n" - "fmla v17.4s, v24.4s, v7.4s\n" - "fmla v21.4s, v24.4s, v22.4s\n" - "fmla v15.4s, v24.4s, v9.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v14.4s, v30.4s, v5.4s\n" - "ldr s30, [x24, x27]\n" - "fmla v1.4s, v23.4s, v9.4s\n" - "ldr x7, [%[inptrs], 168]\n" - "fmla v17.4s, v23.4s, v8.4s\n" - "ldr s31, [x17, x27]\n" - "fmla v13.4s, v26.4s, v4.4s\n" - "ldr x24, [%[inptrs], 128]\n" - "fmla v14.4s, v26.4s, v7.4s\n" - "ldr x17, [%[inptrs], 88]\n" - "fmla v12.4s, v26.4s, v5.4s\n" - "fmla v10.4s, v26.4s, v19.4s\n" - "mov v24.16b, v25.16b\n" - "mov v23.16b, v25.16b\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v0.4s, v27.4s, v4.4s\n" - "fmla v14.4s, v27.4s, v8.4s\n" - "fmla v12.4s, v27.4s, v7.4s\n" - "fmla v15.4s, v27.4s, v5.4s\n" - "fmla v10.4s, v27.4s, v9.4s\n" - "fmla v11.4s, v27.4s, v19.4s\n" - "fmla v20.4s, v27.4s, v22.4s\n" - "ldr s25, [x25, x27]\n" - "fmla v18.4s, v30.4s, v3.4s\n" - "fmla v0.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v4.4s\n" - "fmla v12.4s, v30.4s, v8.4s\n" - "fmla v15.4s, v30.4s, v7.4s\n" - "fmla v1.4s, v30.4s, v5.4s\n" - "fmla v11.4s, v30.4s, v9.4s\n" - "fmla v21.4s, v30.4s, v19.4s\n" - "fmla v24.4s, v30.4s, v22.4s\n" - "ldr s26, [x16, x27]\n" - "fmla v17.4s, v31.4s, v6.4s\n" - "ldr x16, [%[inptrs], 256]\n" - "fmla v15.4s, v31.4s, v8.4s\n" - "fmla v1.4s, v31.4s, v7.4s\n" - "fmla v21.4s, v31.4s, v9.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "ldr x15, [%[inptrs], 216]\n" - "fmla v10.4s, v26.4s, v5.4s\n" - "ldr s29, [x7, x27]\n" - "fmla v1.4s, v25.4s, v8.4s\n" - "ldr s28, [x24, x27]\n" - "fmla v13.4s, v31.4s, v3.4s\n" - "ldr x7, [%[inptrs], 176]\n" - "fmla v14.4s, v31.4s, v6.4s\n" - "ldr x24, [%[inptrs], 136]\n" - "fmla v12.4s, v31.4s, v4.4s\n" - "fmla v10.4s, v31.4s, v7.4s\n" - "fmla v11.4s, v31.4s, v5.4s\n" - "fmla v20.4s, v31.4s, v19.4s\n" - "fmla v0.4s, v29.4s, v3.4s\n" - "ldr s25, [x17, x27]\n" - "fmla v15.4s, v29.4s, v4.4s\n" - "fmla v21.4s, v29.4s, v5.4s\n" - "fmla v12.4s, v29.4s, v6.4s\n" - "fmla v10.4s, v29.4s, v8.4s\n" - "fmla v11.4s, v29.4s, v7.4s\n" - "fmla v20.4s, v29.4s, v9.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v23.4s, v29.4s, v22.4s\n" - "fmla v17.4s, v28.4s, v3.4s\n" - "ldr s29, [x16, x27]\n" - "fmla v15.4s, v28.4s, v6.4s\n" - "ldr s22, [x15, x27]\n" - "fmla v1.4s, v28.4s, v4.4s\n" - "ldr x16, [%[inptrs], 264]\n" - "fmla v11.4s, v28.4s, v8.4s\n" - "ldr x15, [%[inptrs], 224]\n" - "fmla v21.4s, v28.4s, v7.4s\n" - "fmla v24.4s, v28.4s, v9.4s\n" - "fmla v14.4s, v29.4s, v3.4s\n" - "ldr s27, [x7, x27]\n" - "fmla v1.4s, v25.4s, v6.4s\n" - "ldr x7, [%[inptrs], 184]\n" - "fmla v10.4s, v29.4s, v4.4s\n" - "fmla v20.4s, v29.4s, v5.4s\n" - "fmla v21.4s, v25.4s, v8.4s\n" - "ldr s26, [x24, x27]\n" - "fmla v12.4s, v22.4s, v3.4s\n" - "ldr s25, [x16, x27]\n" - "fmla v11.4s, v22.4s, v4.4s\n" - "ldr x16, [%[inptrs], 272]\n" - "fmla v10.4s, v22.4s, v6.4s\n" - "fmla v20.4s, v22.4s, v7.4s\n" - "fmla v24.4s, v22.4s, v5.4s\n" - "fmla v23.4s, v22.4s, v19.4s\n" - "fmla v15.4s, v27.4s, v3.4s\n" - "ldr s31, [x15, x27]\n" - "fmla v11.4s, v27.4s, v6.4s\n" - "ldr s22, [x7, x27]\n" - "fmla v21.4s, v27.4s, v4.4s\n" - "ldr x15, [%[inptrs], 232]\n" - "fmla v20.4s, v27.4s, v8.4s\n" - "fmla v24.4s, v27.4s, v7.4s\n" - "fmla v23.4s, v27.4s, v9.4s\n" - "ldr s19, [x16, x27]\n" - "fmla v1.4s, v26.4s, v3.4s\n" - "ldr s28, [x15, x27]\n" - "fmla 
v21.4s, v26.4s, v6.4s\n" - "ldr x16, [%[inptrs], 280]\n" - "fmla v24.4s, v26.4s, v8.4s\n" - "fmla v10.4s, v25.4s, v3.4s\n" - "fmla v20.4s, v25.4s, v4.4s\n" - "ldr s30, [x16, x27]\n" - "fmla v23.4s, v25.4s, v5.4s\n" - "add x27, x27, #4\n" - "fmla v11.4s, v31.4s, v3.4s\n" - "fmla v21.4s, v22.4s, v3.4s\n" - "fmla v24.4s, v31.4s, v4.4s\n" - "movi v29.16b, #0\n" - "fmla v20.4s, v31.4s, v6.4s\n" - "fmla v23.4s, v31.4s, v7.4s\n" - "fmax v2.4s, v2.4s, v29.4s\n" - "fmax v18.4s, v18.4s, v29.4s\n" - "fmla v24.4s, v22.4s, v6.4s\n" - "fmax v17.4s, v17.4s, v29.4s\n" - "fmla v20.4s, v19.4s, v3.4s\n" - "fmax v1.4s, v1.4s, v29.4s\n" - "str s2, [x20, x28]\n" - "fmla v23.4s, v22.4s, v8.4s\n" - "fmax v16.4s, v16.4s, v29.4s\n" - "ldr x20, [%[outptrs], 8]\n" - "fmla v24.4s, v28.4s, v3.4s\n" - "fmax v0.4s, v0.4s, v29.4s\n" - "str s18, [x20, x28]\n" - "fmax v15.4s, v15.4s, v29.4s\n" - "str s16, [x21, x28]\n" - "fmla v23.4s, v19.4s, v4.4s\n" - "fmax v21.4s, v21.4s, v29.4s\n" - "ldr x20, [%[outptrs], 16]\n" - "fmax v13.4s, v13.4s, v29.4s\n" - "ldr x21, [%[outptrs], 40]\n" - "str s17, [x20, x28]\n" - "fmax v12.4s, v12.4s, v29.4s\n" - "str s0, [x21, x28]\n" - "fmla v23.4s, v28.4s, v6.4s\n" - "str s13, [x22, x28]\n" - "fmax v11.4s, v11.4s, v29.4s\n" - "fmax v24.4s, v24.4s, v29.4s\n" - "ldr x20, [%[outptrs], 24]\n" - "fmax v14.4s, v14.4s, v29.4s\n" - "ldr x21, [%[outptrs], 48]\n" - "str s1, [x20, x28]\n" - "fmla v23.4s, v30.4s, v3.4s\n" - "str s15, [x21, x28]\n" - "fmax v10.4s, v10.4s, v29.4s\n" - "str s14, [x23, x28]\n" - "fmax v20.4s, v20.4s, v29.4s\n" - "ldr x21, [%[outptrs], 56]\n" - "ldr x22, [%[outptrs], 72]\n" - "ldr x23, [%[outptrs], 104]\n" - "fmax v23.4s, v23.4s, v29.4s\n" - "str s21, [x21, x28]\n" - "str s12, [x22, x28]\n" - "str s10, [x23, x28]\n" - "ldr x22, [%[outptrs], 80]\n" - "ldr x23, [%[outptrs], 112]\n" - "str s11, [x22, x28]\n" - "str s20, [x23, x28]\n" - "ldr x22, [%[outptrs], 88]\n" - "ldr x23, [%[outptrs], 120]\n" - "str s24, [x22, x28]\n" - "str s23, [x23, x28]\n" - "add x28, x28, #4\n" - "7:\n" - : [wbptr] "+r" (weight_bias_ptr) - : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" - ); -} - -template <> -template <> -void Conv::execute_tile( - int n_channels, - const void *weight_bias_ptr, - const float *input, - const unsigned int input_row_stride, - const unsigned int input_col_stride, - float *output, - const unsigned int output_row_stride, - const unsigned int output_col_stride -) -{ - __asm __volatile( - "add x24, %[inptr0], %[input_row_stride]\n" - "add x13, %[input_col_stride1], %[input_col_stride1]\n" - "add x8, %[outptr0], %[output_row_stride]\n" - "add x9, x24, %[input_row_stride]\n" - "add x10, x13, #64\n" - "add x19, x13, %[input_col_stride1]\n" - "add x20, x9, %[input_row_stride]\n" - "add x21, x19, #64\n" - "add x17, x19, %[input_col_stride1]\n" - "add x22, x20, %[input_row_stride]\n" - "add x7, x17, #64\n" - "add x11, x17, %[input_col_stride1]\n" - "add x23, x22, %[input_row_stride]\n" - "add x12, x11, #64\n" - "add x25, x8, %[output_row_stride]\n" - "add x26, x25, %[output_row_stride]\n" - "add x27, %[output_col_stride1], %[output_col_stride1]\n" - "and x14, %[n_channels], #3\n" - "add x28, x27, 
%[output_col_stride1]\n" - "lsr x15, %[n_channels], #2\n" - "cbz x15, 4f\n" - "1:\n" - "ldr q23, [%[wbptr]]\n" - "subs x15, x15, #1\n" - "mov v12.16b, v23.16b\n" - "ldr q20, [%[wbptr], #16]\n" - "mov v8.16b, v23.16b\n" - "ldr q6, [%[wbptr], #32]\n" - "mov v11.16b, v23.16b\n" - "ldr q5, [%[wbptr], #48]\n" - "mov v16.16b, v23.16b\n" - "ldr q19, [%[wbptr], #64]\n" - "mov v7.16b, v23.16b\n" - "ldr q4, [%[wbptr], #80]\n" - "mov v10.16b, v23.16b\n" - "ldr q3, [%[wbptr], #96]\n" - "mov v14.16b, v23.16b\n" - "ldr q2, [%[wbptr], #112]\n" - "mov v15.16b, v23.16b\n" - "ldr q1, [%[wbptr], #128]\n" - "mov v17.16b, v23.16b\n" - "ldr q0, [%[wbptr], #144]\n" - "mov v9.16b, v23.16b\n" - "ldr q28, [%[inptr0]]\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "ldr q25, [x24]\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "ldr q18, [%[inptr0], %[input_col_stride1]]\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "ldr q30, [x9]\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "ldr q29, [x24, %[input_col_stride1]]\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "ldr q24, [%[inptr0], x13]\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "ldr q27, [x20]\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "ldr q22, [x9, %[input_col_stride1]]\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x24, #64]\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "prfm pldl1keep, [x24, x16]\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "prfm pldl1keep, [x20, #64]\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "beq 3f\n" - "2:\n" - "mov v13.16b, v23.16b\n" - "ldr q21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr q25, [%[inptr0], x19]\n" - "fmla v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr q24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr q26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr q30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr q22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr q21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr q24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm 
pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr q26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "subs x15, x15, #1\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "mov v25.16b, v23.16b\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr q27, [x20, x13]\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v16.4s, v26.4s, v1.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "ldr q28, [x9, x19]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "ldr q29, [x24, x17]\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "ldr q23, [%[inptr0], x11]\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr q30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr q23, [x22, x13]\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "ldr q29, [x20, x19]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr q23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v21.4s, v29.4s, v20.4s\n" - "ldr q26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #16\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "prfm pldl1keep, [x24, x16]\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "ldr q20, [x22, x19]\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr q26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr q23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add 
x9, x9, #16\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr q27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr q20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr q19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #16\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr q29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "prfm pldl1keep, [x20, #64]\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "add x22, x22, #16\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "ldr q30, [x23, x11]\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr q23, [%[wbptr]]\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "add x23, x23, #16\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "ldr q20, [%[wbptr], #16]\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "ldr q6, [%[wbptr], #32]\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "ldr q19, [%[wbptr], #64]\n" - "fmax v12.4s, v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "ldr q5, [%[wbptr], #48]\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "ldr q4, [%[wbptr], #80]\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "ldr q2, [%[wbptr], #112]\n" - "fmov v27.4s, #6.0\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "ldr q3, [%[wbptr], #96]\n" - "fmin v12.4s, v12.4s, v27.4s\n" - "ldr q1, [%[wbptr], #128]\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str q12, [%[outptr0]]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "str q10, [%[outptr0], x27]\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "str q9, [%[outptr0], x28]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "str q8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str q7, [x8, %[output_col_stride1]]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str q17, [x8, x27]\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "ldr q0, [%[wbptr], #144]\n" - "str q25, [x8, x28]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str q16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "ldr q28, [%[inptr0]]\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "ldr q25, [x24]\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "str q18, [x25, x27]\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "ldr q18, [%[inptr0], %[input_col_stride1]]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "ldr q30, [x9]\n" - "str q24, [x25, x28]\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str q14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "ldr q29, [x24, %[input_col_stride1]]\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "ldr q24, [%[inptr0], x13]\n" - "str q13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "str q22, [x26, x27]\n" - "mov v12.16b, v23.16b\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "ldr q27, [x20]\n" - "mov v8.16b, v23.16b\n" - "ldr q22, [x9, %[input_col_stride1]]\n" - "str q21, [x26, x28]\n" - "mov v11.16b, v23.16b\n" - "mov 
v16.16b, v23.16b\n" - "add %[outptr0], %[outptr0], #16\n" - "mov v7.16b, v23.16b\n" - "add x8, x8, #16\n" - "mov v10.16b, v23.16b\n" - "add x25, x25, #16\n" - "mov v14.16b, v23.16b\n" - "add x26, x26, #16\n" - "mov v15.16b, v23.16b\n" - "mov v17.16b, v23.16b\n" - "mov v9.16b, v23.16b\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "bne 2b\n" - "3:\n" - "mov v13.16b, v23.16b\n" - "ldr q21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr q25, [%[inptr0], x19]\n" - "fmla v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr q24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr q26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr q30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr q22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr q21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr q24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr q26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #160\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "ldr q27, [x20, x13]\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v25.16b, v23.16b\n" - "fmla 
v16.4s, v26.4s, v1.4s\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr q28, [x9, x19]\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "ldr q29, [x24, x17]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "ldr q23, [%[inptr0], x11]\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #16\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr q30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr q23, [x22, x13]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "ldr q29, [x20, x19]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr q23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v21.4s, v29.4s, v20.4s\n" - "ldr q26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #16\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "ldr q20, [x22, x19]\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr q26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr q23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add x9, x9, #16\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr q27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr q20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr q19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr q28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #16\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr q29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "add x22, x22, #16\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr q30, [x23, x11]\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "add x23, x23, #16\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "fmov v27.4s, #6.0\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "fmax v12.4s, 
v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v27.4s\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str q12, [%[outptr0]]\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "str q11, [%[outptr0], %[output_col_stride1]]\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "str q10, [%[outptr0], x27]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "str q9, [%[outptr0], x28]\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "add %[outptr0], %[outptr0], #16\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "str q8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str q7, [x8, %[output_col_stride1]]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str q17, [x8, x27]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str q25, [x8, x28]\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "add x8, x8, #16\n" - "str q16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "str q15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str q18, [x25, x27]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "str q24, [x25, x28]\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "str q14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "str q13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "add x25, x25, #16\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "str q22, [x26, x27]\n" - "str q21, [x26, x28]\n" - "add x26, x26, #16\n" - "4:\n" - "cbz x14, 7f\n" - "ldr s23, [%[wbptr]]\n" - "mov v12.16b, v23.16b\n" - "ldr s20, [%[wbptr], #4]\n" - "mov v8.16b, v23.16b\n" - "ldr s6, [%[wbptr], #8]\n" - "mov v11.16b, v23.16b\n" - "ldr s5, [%[wbptr], #12]\n" - "mov v16.16b, v23.16b\n" - "ldr s19, [%[wbptr], #16]\n" - "mov v7.16b, v23.16b\n" - "ldr s4, [%[wbptr], #20]\n" - "mov v10.16b, v23.16b\n" - "ldr s3, [%[wbptr], #24]\n" - "mov v14.16b, v23.16b\n" - "ldr s2, [%[wbptr], #28]\n" - "mov v15.16b, v23.16b\n" - "ldr s1, [%[wbptr], #32]\n" - "mov v17.16b, v23.16b\n" - "ldr s0, [%[wbptr], #36]\n" - "mov v9.16b, v23.16b\n" - "ldr s28, [%[inptr0]]\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "ldr s25, [x24]\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "ldr s18, [%[inptr0], %[input_col_stride1]]\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "ldr s30, [x9]\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "ldr s29, [x24, %[input_col_stride1]]\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "ldr s24, [%[inptr0], x13]\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "ldr s27, [x20]\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "ldr s22, [x9, %[input_col_stride1]]\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "prfm pldl1keep, [x24, #64]\n" - "subs x14, x14, #1\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [x24, x16]\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "prfm pldl1keep, [x20, #64]\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "beq 6f\n" - "5:\n" - "mov v13.16b, v23.16b\n" - "ldr s21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr s25, [%[inptr0], x19]\n" - "fmla 
v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr s24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr s26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr s30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr s22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr s21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr s24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr s26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "subs x14, x14, #1\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "mov v25.16b, v23.16b\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr s27, [x20, x13]\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v16.4s, v26.4s, v1.4s\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "ldr s28, [x9, x19]\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "ldr s29, [x24, x17]\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, 
v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "ldr s23, [%[inptr0], x11]\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "prfm pldl1keep, [%[inptr0], #64]\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr s30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x16]\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr s23, [x22, x13]\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "ldr s29, [x20, x19]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "prfm pldl1keep, [%[inptr0], x10]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr s23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - "fmla v21.4s, v29.4s, v20.4s\n" - "ldr s26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #4\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "prfm pldl1keep, [x24, #64]\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "prfm pldl1keep, [x24, x16]\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "ldr s20, [x22, x19]\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr s26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr s23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add x9, x9, #4\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "prfm pldl1keep, [x9, #64]\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "prfm pldl1keep, [x9, x16]\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr s27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr s20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr s19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #4\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr s29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "prfm pldl1keep, [x20, #64]\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "add x22, x22, #4\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "ldr s30, [x23, x11]\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr s23, [%[wbptr]]\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "add x23, x23, #4\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "ldr s20, [%[wbptr], #4]\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "ldr s6, [%[wbptr], #8]\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "ldr s19, [%[wbptr], #16]\n" - "fmax v12.4s, v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "ldr s5, [%[wbptr], #12]\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "ldr s4, [%[wbptr], #20]\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "ldr s2, [%[wbptr], #28]\n" - "fmov v27.4s, #6.0\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "ldr s3, [%[wbptr], #24]\n" - "fmin v12.4s, 
v12.4s, v27.4s\n" - "ldr s1, [%[wbptr], #32]\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str s12, [%[outptr0]]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "str s10, [%[outptr0], x27]\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "str s9, [%[outptr0], x28]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "str s8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str s7, [x8, %[output_col_stride1]]\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str s17, [x8, x27]\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "ldr s0, [%[wbptr], #36]\n" - "str s25, [x8, x28]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str s16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "ldr s28, [%[inptr0]]\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "ldr s25, [x24]\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "str s18, [x25, x27]\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "ldr s18, [%[inptr0], %[input_col_stride1]]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "ldr s30, [x9]\n" - "str s24, [x25, x28]\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str s14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "ldr s29, [x24, %[input_col_stride1]]\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "ldr s24, [%[inptr0], x13]\n" - "str s13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "str s22, [x26, x27]\n" - "mov v12.16b, v23.16b\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "ldr s27, [x20]\n" - "mov v8.16b, v23.16b\n" - "ldr s22, [x9, %[input_col_stride1]]\n" - "str s21, [x26, x28]\n" - "mov v11.16b, v23.16b\n" - "mov v16.16b, v23.16b\n" - "add %[outptr0], %[outptr0], #4\n" - "mov v7.16b, v23.16b\n" - "add x8, x8, #4\n" - "mov v10.16b, v23.16b\n" - "add x25, x25, #4\n" - "mov v14.16b, v23.16b\n" - "add x26, x26, #4\n" - "mov v15.16b, v23.16b\n" - "mov v17.16b, v23.16b\n" - "mov v9.16b, v23.16b\n" - "fmla v12.4s, v28.4s, v20.4s\n" - "fmla v8.4s, v25.4s, v20.4s\n" - "fmla v11.4s, v18.4s, v20.4s\n" - "fmla v16.4s, v30.4s, v20.4s\n" - "fmla v12.4s, v25.4s, v19.4s\n" - "fmla v8.4s, v30.4s, v19.4s\n" - "fmla v12.4s, v18.4s, v6.4s\n" - "fmla v8.4s, v29.4s, v6.4s\n" - "fmla v12.4s, v30.4s, v2.4s\n" - "fmla v12.4s, v29.4s, v4.4s\n" - "bne 5b\n" - "6:\n" - "mov v13.16b, v23.16b\n" - "ldr s21, [x24, x13]\n" - "mov v18.16b, v23.16b\n" - "prfm pldl1keep, [x24, x10]\n" - "fmla v11.4s, v29.4s, v19.4s\n" - "prfm pldl1keep, [%[inptr0], x21]\n" - "fmla v7.4s, v29.4s, v20.4s\n" - "ldr s25, [%[inptr0], x19]\n" - "fmla v12.4s, v24.4s, v5.4s\n" - "prfm pldl1keep, [x22, #64]\n" - "fmla v11.4s, v24.4s, v6.4s\n" - "prfm pldl1keep, [x20, x16]\n" - "fmla v10.4s, v24.4s, v20.4s\n" - "ldr s24, [x22]\n" - "fmla v8.4s, v27.4s, v2.4s\n" - "prfm pldl1keep, [x9, x10]\n" - "fmla v16.4s, v27.4s, v19.4s\n" - "prfm pldl1keep, [x24, x21]\n" - "fmla v14.4s, v27.4s, v20.4s\n" - "ldr s26, [x20, %[input_col_stride1]]\n" - "fmla v12.4s, v22.4s, v1.4s\n" - "prfm pldl1keep, [%[inptr0], x7]\n" - "fmla v8.4s, v22.4s, v4.4s\n" - "prfm pldl1keep, [x23, #64]\n" - "fmla v11.4s, v22.4s, v2.4s\n" - "prfm pldl1keep, [x22, x16]\n" - "fmla v16.4s, v22.4s, v6.4s\n" - "prfm pldl1keep, [x20, x10]\n" - "fmla v7.4s, v22.4s, v19.4s\n" - "prfm pldl1keep, [x9, x21]\n" - "fmla v15.4s, v22.4s, v20.4s\n" - "ldr s30, [x9, x13]\n" - "fmla v12.4s, v21.4s, v3.4s\n" - "prfm pldl1keep, [x24, x7]\n" - "fmla v8.4s, v21.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr0], x12]\n" - "fmla v11.4s, 
v21.4s, v4.4s\n" - "prfm pldl1keep, [x23, x16]\n" - "fmla v7.4s, v21.4s, v6.4s\n" - "prfm pldl1keep, [x22, x10]\n" - "fmla v10.4s, v21.4s, v19.4s\n" - "prfm pldl1keep, [x20, x21]\n" - "fmla v17.4s, v21.4s, v20.4s\n" - "ldr s22, [x24, x19]\n" - "fmla v11.4s, v25.4s, v5.4s\n" - "prfm pldl1keep, [x9, x7]\n" - "fmla v10.4s, v25.4s, v6.4s\n" - "prfm pldl1keep, [x24, x12]\n" - "fmla v9.4s, v25.4s, v20.4s\n" - "ldr s21, [%[inptr0], x17]\n" - "fmla v16.4s, v24.4s, v2.4s\n" - "prfm pldl1keep, [x23, x10]\n" - "fmla v14.4s, v24.4s, v19.4s\n" - "ldr s24, [x23]\n" - "fmla v8.4s, v26.4s, v1.4s\n" - "prfm pldl1keep, [x22, x21]\n" - "fmla v16.4s, v26.4s, v4.4s\n" - "prfm pldl1keep, [x20, x7]\n" - "fmla v7.4s, v26.4s, v2.4s\n" - "prfm pldl1keep, [x9, x12]\n" - "fmla v14.4s, v26.4s, v6.4s\n" - "prfm pldl1keep, [x23, x21]\n" - "fmla v15.4s, v26.4s, v19.4s\n" - "prfm pldl1keep, [x22, x7]\n" - "fmla v13.4s, v26.4s, v20.4s\n" - "ldr s26, [x22, %[input_col_stride1]]\n" - "fmla v12.4s, v30.4s, v0.4s\n" - "prfm pldl1keep, [x20, x12]\n" - "fmla v8.4s, v30.4s, v3.4s\n" - "prfm pldl1keep, [x23, x7]\n" - "fmla v11.4s, v30.4s, v1.4s\n" - "prfm pldl1keep, [x22, x12]\n" - "fmla v16.4s, v30.4s, v5.4s\n" - "prfm pldl1keep, [x23, x12]\n" - "fmla v7.4s, v30.4s, v4.4s\n" - "add %[wbptr], %[wbptr], #40\n" - "fmla v10.4s, v30.4s, v2.4s\n" - "prfm pldl1keep, [%[wbptr], #64]\n" - "fmla v15.4s, v30.4s, v6.4s\n" - "fmla v17.4s, v30.4s, v19.4s\n" - "fmla v18.4s, v30.4s, v20.4s\n" - "ldr s27, [x20, x13]\n" - "fmla v11.4s, v22.4s, v3.4s\n" - "fmla v7.4s, v22.4s, v5.4s\n" - "fmla v10.4s, v22.4s, v4.4s\n" - "fmla v17.4s, v22.4s, v6.4s\n" - "fmla v9.4s, v22.4s, v19.4s\n" - "fmla v14.4s, v24.4s, v2.4s\n" - "mov v25.16b, v23.16b\n" - "fmla v16.4s, v26.4s, v1.4s\n" - "fmla v10.4s, v21.4s, v5.4s\n" - "fmla v15.4s, v26.4s, v2.4s\n" - "fmla v25.4s, v22.4s, v20.4s\n" - "ldr s28, [x9, x19]\n" - "fmla v9.4s, v21.4s, v6.4s\n" - "ldr s29, [x24, x17]\n" - "fmla v14.4s, v26.4s, v4.4s\n" - "fmla v13.4s, v26.4s, v19.4s\n" - "mov v22.16b, v23.16b\n" - "fmla v8.4s, v27.4s, v0.4s\n" - "fmla v16.4s, v27.4s, v3.4s\n" - "fmla v7.4s, v27.4s, v1.4s\n" - "fmla v14.4s, v27.4s, v5.4s\n" - "fmla v15.4s, v27.4s, v4.4s\n" - "fmla v17.4s, v27.4s, v2.4s\n" - "fmla v13.4s, v27.4s, v6.4s\n" - "fmla v18.4s, v27.4s, v19.4s\n" - "fmla v22.4s, v27.4s, v20.4s\n" - "mov v24.16b, v23.16b\n" - "mov v21.16b, v23.16b\n" - "fmla v11.4s, v28.4s, v0.4s\n" - "fmla v7.4s, v28.4s, v3.4s\n" - "fmla v10.4s, v28.4s, v1.4s\n" - "fmla v15.4s, v28.4s, v5.4s\n" - "fmla v17.4s, v28.4s, v4.4s\n" - "fmla v9.4s, v28.4s, v2.4s\n" - "fmla v18.4s, v28.4s, v6.4s\n" - "fmla v25.4s, v28.4s, v19.4s\n" - "fmla v24.4s, v28.4s, v20.4s\n" - "ldr s23, [%[inptr0], x11]\n" - "fmla v10.4s, v29.4s, v3.4s\n" - "add %[inptr0], %[inptr0], #4\n" - "fmla v17.4s, v29.4s, v5.4s\n" - "fmla v9.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v6.4s\n" - "ldr s30, [x23, %[input_col_stride1]]\n" - "fmla v14.4s, v30.4s, v1.4s\n" - "fmla v13.4s, v30.4s, v2.4s\n" - "fmla v9.4s, v23.4s, v5.4s\n" - "ldr s23, [x22, x13]\n" - "fmla v16.4s, v23.4s, v0.4s\n" - "ldr s29, [x20, x19]\n" - "fmla v14.4s, v23.4s, v3.4s\n" - "fmla v15.4s, v23.4s, v1.4s\n" - "fmla v13.4s, v23.4s, v4.4s\n" - "fmla v18.4s, v23.4s, v2.4s\n" - "fmla v22.4s, v23.4s, v19.4s\n" - "ldr s23, [x9, x17]\n" - "fmla v7.4s, v29.4s, v0.4s\n" - "fmla v15.4s, v29.4s, v3.4s\n" - "fmla v17.4s, v29.4s, v1.4s\n" - "fmla v13.4s, v29.4s, v5.4s\n" - "fmla v18.4s, v29.4s, v4.4s\n" - "fmla v25.4s, v29.4s, v2.4s\n" - "fmla v22.4s, v29.4s, v6.4s\n" - "fmla v24.4s, v29.4s, v19.4s\n" - 
"fmla v21.4s, v29.4s, v20.4s\n" - "ldr s26, [x24, x11]\n" - "fmla v10.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x13]\n" - "fmla v17.4s, v23.4s, v3.4s\n" - "add x24, x24, #4\n" - "fmla v9.4s, v23.4s, v1.4s\n" - "fmla v18.4s, v23.4s, v5.4s\n" - "fmla v25.4s, v23.4s, v4.4s\n" - "fmla v24.4s, v23.4s, v6.4s\n" - "fmla v14.4s, v28.4s, v0.4s\n" - "ldr s20, [x22, x19]\n" - "fmla v9.4s, v26.4s, v3.4s\n" - "fmla v13.4s, v28.4s, v1.4s\n" - "fmla v25.4s, v26.4s, v5.4s\n" - "ldr s26, [x20, x17]\n" - "fmla v22.4s, v28.4s, v2.4s\n" - "ldr s23, [x9, x11]\n" - "fmla v15.4s, v20.4s, v0.4s\n" - "add x9, x9, #4\n" - "fmla v13.4s, v20.4s, v3.4s\n" - "fmla v18.4s, v20.4s, v1.4s\n" - "fmla v22.4s, v20.4s, v4.4s\n" - "fmla v24.4s, v20.4s, v2.4s\n" - "fmla v21.4s, v20.4s, v19.4s\n" - "ldr s27, [x23, x19]\n" - "fmla v17.4s, v26.4s, v0.4s\n" - "ldr s20, [x22, x17]\n" - "fmla v18.4s, v26.4s, v3.4s\n" - "fmla v25.4s, v26.4s, v1.4s\n" - "fmla v22.4s, v26.4s, v5.4s\n" - "fmla v24.4s, v26.4s, v4.4s\n" - "fmla v21.4s, v26.4s, v6.4s\n" - "ldr s19, [x20, x11]\n" - "fmla v9.4s, v23.4s, v0.4s\n" - "ldr s28, [x23, x17]\n" - "fmla v25.4s, v23.4s, v3.4s\n" - "add x20, x20, #4\n" - "fmla v24.4s, v23.4s, v5.4s\n" - "ldr s29, [x22, x11]\n" - "fmla v13.4s, v27.4s, v0.4s\n" - "add x22, x22, #4\n" - "fmla v22.4s, v27.4s, v1.4s\n" - "fmla v21.4s, v27.4s, v2.4s\n" - "fmla v18.4s, v20.4s, v0.4s\n" - "ldr s30, [x23, x11]\n" - "fmla v24.4s, v20.4s, v1.4s\n" - "add x23, x23, #4\n" - "fmla v22.4s, v20.4s, v3.4s\n" - "fmla v21.4s, v20.4s, v4.4s\n" - "fmla v25.4s, v19.4s, v0.4s\n" - "movi v26.16b, #0\n" - "fmla v24.4s, v19.4s, v3.4s\n" - "fmov v27.4s, #6.0\n" - "fmla v21.4s, v19.4s, v5.4s\n" - "fmla v22.4s, v28.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v26.4s\n" - "fmax v11.4s, v11.4s, v26.4s\n" - "fmla v24.4s, v29.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v26.4s\n" - "fmla v21.4s, v28.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v27.4s\n" - "fmin v11.4s, v11.4s, v27.4s\n" - "fmin v10.4s, v10.4s, v27.4s\n" - "str s12, [%[outptr0]]\n" - "fmax v9.4s, v9.4s, v26.4s\n" - "str s11, [%[outptr0], %[output_col_stride1]]\n" - "fmla v21.4s, v29.4s, v3.4s\n" - "str s10, [%[outptr0], x27]\n" - "fmin v9.4s, v9.4s, v27.4s\n" - "fmax v8.4s, v8.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v26.4s\n" - "str s9, [%[outptr0], x28]\n" - "fmla v21.4s, v30.4s, v0.4s\n" - "fmin v8.4s, v8.4s, v27.4s\n" - "add %[outptr0], %[outptr0], #4\n" - "fmin v7.4s, v7.4s, v27.4s\n" - "fmax v17.4s, v17.4s, v26.4s\n" - "str s8, [x8]\n" - "fmax v25.4s, v25.4s, v26.4s\n" - "str s7, [x8, %[output_col_stride1]]\n" - "fmin v17.4s, v17.4s, v27.4s\n" - "fmin v25.4s, v25.4s, v27.4s\n" - "fmax v16.4s, v16.4s, v26.4s\n" - "str s17, [x8, x27]\n" - "fmax v15.4s, v15.4s, v26.4s\n" - "str s25, [x8, x28]\n" - "fmin v16.4s, v16.4s, v27.4s\n" - "fmin v15.4s, v15.4s, v27.4s\n" - "add x8, x8, #4\n" - "str s16, [x25]\n" - "fmax v18.4s, v18.4s, v26.4s\n" - "str s15, [x25, %[output_col_stride1]]\n" - "fmax v24.4s, v24.4s, v26.4s\n" - "fmin v18.4s, v18.4s, v27.4s\n" - "fmax v14.4s, v14.4s, v26.4s\n" - "fmin v24.4s, v24.4s, v27.4s\n" - "fmax v13.4s, v13.4s, v26.4s\n" - "str s18, [x25, x27]\n" - "fmin v14.4s, v14.4s, v27.4s\n" - "str s24, [x25, x28]\n" - "fmin v13.4s, v13.4s, v27.4s\n" - "str s14, [x26]\n" - "fmax v22.4s, v22.4s, v26.4s\n" - "str s13, [x26, %[output_col_stride1]]\n" - "fmax v21.4s, v21.4s, v26.4s\n" - "fmin v22.4s, v22.4s, v27.4s\n" - "add x25, x25, #4\n" - "fmin v21.4s, v21.4s, v27.4s\n" - "str s22, [x26, x27]\n" - "str s21, [x26, x28]\n" - "add x26, x26, #4\n" - "7:\n" - : [inptr0] "+r" (input), [outptr0] "+r" 
(output), [wbptr] "+r" (weight_bias_ptr) - : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) - : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" - ); -} - -#endif // __aarch64__ - -template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp deleted file mode 100644 index 27bfb843f6..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "impl_dilated.hpp" - -template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>; -template class depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp deleted file mode 100644 index 1bae815613..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include -#include -#include - -#include "depthwise.hpp" - -namespace depthwise -{ - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename TIn, typename TBias, typename TOut -> -class DilatedDepthwiseConvolution : public IDepthwiseConvolution -{ - public: - /** Create a new dilated depthwise convolution engine. 
- */ - DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - /** Create a new dilated depthwise convolution engine. - */ - DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - // Cannot copy or move a DilatedDepthwiseConvolution. - DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete; - DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete; - - /* Set input tensor and stride. */ - void set_input(const void *inptr) override; - void set_input(const void *inptr, int column_stride) override; - void set_input(const void *inptr, int row_stride, int column_stride) override; - void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override; - - /* Set output tensor and stride. */ - void set_output(void *outptr) override; - void set_output(void *outptr, int column_stride) override; - void set_output(void *outptr, int row_stride, int column_stride) override; - void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override; - - static int get_output_size( - int dim_size, - unsigned int padding_before, - unsigned int padding_after, - int dilation_factor - ); - - int output_size( - int dim_size, unsigned int padding_before, unsigned int padding_after - ) const override; - - /* Weights and biases are re-ordered to improve memory access patterns. Use - * these methods to determine the size of the re-pack buffer and to set the - * address (and implicitly reorder the weights and biases into) the buffer. - */ - size_t get_packed_params_size(void) const override; - void set_packed_params_buffer(void *) override; - - void pack_params(const void *weights, const void *biases=nullptr) const override; - void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override; - void pack_params( - void *buffer, - const void* weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const override; - - /* Working space is used to pad tensors on the fly. Before running any - * inference check the amount of space required, allocate and provide a - * pointer to the convolution engine. 
- */ - size_t get_working_space_size(unsigned int nthreads=1) const override; - void set_working_space(void *) override; - - unsigned int get_window(void) const override; - void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; - - protected: - /** Protected constructor which also accepts a function to construct a new - * subconvolution - */ - DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right, - std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn - ); - - const int _dilation_factor; - const int _n_input_rows, _n_input_cols, _n_channels; - const int _padding_top, _padding_left; - const int _n_output_rows, _n_output_cols; - - /* Dilated depthwise convolution is performed through repeated calls to - * non-dilated convolutions. If the dilation factor is $n$, then we perform - * $(n + 1)^2$ depthwise convolutions. - */ - using BaseDepthwise = DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - TIn, TBias, TOut - >; - std::deque<std::deque<std::unique_ptr<BaseDepthwise>>> _convs; -}; - -} // namespace depthwise
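The deleted `DilatedDepthwiseConvolution` above realises a dilated depthwise convolution as a grid of non-dilated subconvolutions, one per phase of the dilation pattern, held in the `_convs` member. Below is a minimal scalar sketch of that decomposition, assuming "valid" padding, a single channel, unit stride, and `dilation` measured as the spacing between kernel taps (the deleted header counts $(n + 1)^2$ subconvolutions for a dilation factor of $n$, which differs only in how the factor is defined). The function names are illustrative and are not the library's API.

```cpp
#include <vector>

// Illustrative sketch, not the library's implementation: a non-dilated 3x3,
// stride-1 depthwise convolution over one phase of the dilation grid, i.e.
// the subsampled points {row0 + step*r, col0 + step*c}. Single channel.
static void conv3x3_phase(
    const std::vector<float> &in, int in_cols,
    int row0, int col0, int step, const float (&w)[3][3],
    std::vector<float> &out, int out_rows, int out_cols)
{
  for (int r = 0; row0 + step * r < out_rows; r++)
  {
    for (int c = 0; col0 + step * c < out_cols; c++)
    {
      float acc = 0.0f;
      for (int kr = 0; kr < 3; kr++)
      {
        for (int kc = 0; kc < 3; kc++)
        {
          // Taps are 'step' apart in the full input but adjacent on the
          // subsampled grid, so each phase is an ordinary 3x3 convolution.
          acc += w[kr][kc] * in[(row0 + step * (r + kr)) * in_cols
                                + (col0 + step * (c + kc))];
        }
      }
      out[(row0 + step * r) * out_cols + (col0 + step * c)] = acc;
    }
  }
}

// "Valid" padding: out must hold (in_rows - 2*dilation) * (in_cols - 2*dilation)
// elements. One subconvolution runs per (i, j) phase of the dilation grid.
void dilated_depthwise_3x3(
    const std::vector<float> &in, int in_rows, int in_cols,
    int dilation, const float (&w)[3][3], std::vector<float> &out)
{
  const int out_rows = in_rows - 2 * dilation;
  const int out_cols = in_cols - 2 * dilation;
  for (int i = 0; i < dilation; i++)
  {
    for (int j = 0; j < dilation; j++)
    {
      conv3x3_phase(in, in_cols, i, j, dilation, w, out, out_rows, out_cols);
    }
  }
}
```

The deleted code follows the same structure, but pads each phase's input on the fly out of the working space and dispatches every subconvolution to an optimised non-dilated `DepthwiseConvolution` engine rather than scalar loops.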
- */ - -#include "depthwise_quantized_dilated.hpp" -#include "impl_dilated.hpp" - -namespace depthwise { - -template -QAsymm8DilatedDepthwiseConvolution:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : QAsymm8DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_rows, padding_top, padding_bottom, dilation_factor), - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_cols, padding_left, padding_right, dilation_factor), - activation, weight_quantisation, input_quantisation, - output_quantisation, padding_top, padding_left, padding_bottom, - padding_right) {} - -template -QAsymm8DilatedDepthwiseConvolution:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : QAsymm8DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - n_output_rows, n_output_cols, activation, weight_quantisation, - input_quantisation, output_quantisation, - qasymm8::QAsymm8RescaleParams::make_rescale_params( - weight_quantisation, input_quantisation, output_quantisation), - padding_top, padding_left, padding_bottom, padding_right) {} - -template -QAsymm8DilatedDepthwiseConvolution:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams &rescale_parameters, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : QAsymm8DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_rows, padding_top, padding_bottom, dilation_factor), - QAsymm8DilatedDepthwiseConvolution::get_output_size( - n_input_cols, padding_left, padding_right, dilation_factor), - activation, weight_quantisation, input_quantisation, - output_quantisation, rescale_parameters, padding_top, padding_left, - padding_bottom, padding_right) {} - -template -QAsymm8DilatedDepthwiseConvolution:: - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams &rescale_parameters, - unsigned int 
padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right) - : DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - n_output_rows, n_output_cols, activation, padding_top, padding_left, - padding_bottom, padding_right, - [weight_quantisation, input_quantisation, output_quantisation, - rescale_parameters]( - const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int n_output_rows, const int n_output_cols, - const nck::ActivationFunction activation, - const unsigned int padding_top, const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right) -> IDepthwiseConvolution * { - return new QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, - StrideRows, StrideCols>( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, weight_quantisation, - input_quantisation, output_quantisation, rescale_parameters, - padding_top, padding_left, padding_bottom, padding_right); - }) {} - -} // namespace depthwise - -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>; -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>; -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>; -template class depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>; diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp deleted file mode 100644 index 99f0f53792..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
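The rescale parameters threaded through the constructors above condense the input, weight and output scales into a single fixed-point multiply-and-shift, which the NEON helpers declared in depthwise_quantized.hpp further down (saturating_doubling_high_mul, rounding_divide_by_exp2) then apply per accumulator. A scalar sketch of both halves, assuming the usual gemmlowp-style scheme; quantize_multiplier and requantize are illustrative names, not the library's API:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

// Decompose real_multiplier = s_input * s_weight / s_output (assumed in
// (0, 1)) into a Q31 fixed-point multiplier and a right shift.
void quantize_multiplier(double real_multiplier, int32_t &multiplier, int &shift)
{
    shift = 0;
    while (real_multiplier < 0.5)
    {
        real_multiplier *= 2.0;
        shift++;
    }
    int64_t q = static_cast<int64_t>(std::round(real_multiplier * (1ll << 31)));
    if (q == (1ll << 31)) { q /= 2; shift--; }  // Rounded up to exactly 1.0.
    multiplier = static_cast<int32_t>(q);
}

// Scalar model of the two NEON helpers: saturating doubling high multiply
// (vqrdmulh) followed by a rounding right shift by 2^shift.
int32_t requantize(int32_t acc, int32_t multiplier, int shift)
{
    assert(shift >= 0 && "left-shift case not handled in this sketch");

    // Saturating doubling high multiply: only INT32_MIN * INT32_MIN overflows.
    int32_t high;
    if (acc == std::numeric_limits<int32_t>::min() && multiplier == acc)
    {
        high = std::numeric_limits<int32_t>::max();
    }
    else
    {
        const int64_t prod = 2 * static_cast<int64_t>(acc) * multiplier;
        high = static_cast<int32_t>((prod + (1ll << 30)) >> 31);
    }

    // Rounding divide by 2^shift (round to nearest, with the negative fixup
    // matching the intrinsic version above).
    const int32_t mask      = (1 << shift) - 1;
    const int32_t remainder = high & mask;
    const int32_t threshold = (mask >> 1) + (high < 0 ? 1 : 0);
    return (high >> shift) + (remainder > threshold ? 1 : 0);
}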
- */ -#include "impl_fp16_fp16.hpp" - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -namespace depthwise -{ -template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>; -template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>; -template class DepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>; -template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>; -} // namespace depthwise -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp deleted file mode 100644 index c13dd70a61..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_fp32_fp32.hpp" - -namespace depthwise -{ -template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float, float>; -template class DepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>; -template class DepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>; -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp deleted file mode 100644 index bddae51135..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "impl_base.hpp"
-
-// TODO Move to common utilities somewhere
-template <size_t Size> struct DType { };
-template <> struct DType<1> { using scalar_type = uint8_t; };
-template <> struct DType<2> { using scalar_type = uint16_t; };
-template <> struct DType<4> { using scalar_type = uint32_t; };
-
-namespace depthwise
-{
-
-template <unsigned int KernelRows, unsigned int KernelColumns, size_t WeightSize, size_t BiasSize>
-void PackParameters<KernelRows, KernelColumns, WeightSize, BiasSize>::execute(
-  unsigned int n_channels,
-  void *buffer,
-  const void *weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void *biases
-)
-{
-  using TWeight = typename DType<WeightSize>::scalar_type;
-  using TBias = typename DType<BiasSize>::scalar_type;
-
-  auto buffer_ptr = static_cast<uint8_t *>(buffer);
-  auto weights_ptr = static_cast<const TWeight *>(weights);
-  auto biases_ptr = static_cast<const TBias *>(biases);
-
-  const unsigned int veclen = 16 / WeightSize;
-  for (; n_channels >= veclen; n_channels -= veclen)
-  {
-    // Copy biases
-    for (unsigned int i = 0; i < veclen; i++)
-    {
-      auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
-      *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
-      buffer_ptr += BiasSize;
-    }
-
-    // Copy weights
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelColumns; j++)
-      {
-        for (unsigned int c = 0; c < veclen; c++)
-        {
-          *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride + c];
-          buffer_ptr += WeightSize;
-        }
-      }
-    }
-    weights_ptr += veclen;
-  }
-  for (; n_channels; n_channels--)
-  {
-    // Copy bias
-    auto ptr = reinterpret_cast<TBias *>(buffer_ptr);
-    *ptr = (biases_ptr == nullptr) ? 0x0 : *(biases_ptr++);
-    buffer_ptr += BiasSize;
-
-    // Copy weights
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelColumns; j++)
-      {
-        *(reinterpret_cast<TWeight *>(buffer_ptr)) = weights_ptr[i*weight_row_stride + j*weight_col_stride];
-        buffer_ptr += WeightSize;
-      }
-    }
-    weights_ptr++;
-  }
-}
-
-template struct PackParameters<3, 3, 2ul, 2ul>;
-template struct PackParameters<3, 3, 4ul, 4ul>;
-template struct PackParameters<5, 5, 2ul, 2ul>;
-template struct PackParameters<5, 5, 4ul, 4ul>;
-} // namespace
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
deleted file mode 100644
index b09f620475..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
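The packing routine above interleaves, for each block of veclen channels, veclen biases followed by the KernelRows x KernelColumns weights, channel-innermost. The buffer it fills therefore has size n_channels * (KernelRows * KernelColumns * WeightSize + BiasSize), which matches the default get_packed_params_size() in impl_base.hpp further down. A quick illustrative check (packed_size is a hypothetical helper, not the library API):

#include <cstddef>
#include <cstdio>

// Packed parameter bytes per tensor: all kernel weights plus one bias per
// channel, matching the layout written by PackParameters::execute.
static size_t packed_size(size_t n_channels, size_t kernel_rows,
                          size_t kernel_cols, size_t weight_size,
                          size_t bias_size)
{
    return n_channels * (kernel_rows * kernel_cols * weight_size + bias_size);
}

int main()
{
    // 3x3 fp32 kernel with fp32 bias, 64 channels: 64 * (9*4 + 4) = 2560 bytes.
    std::printf("%zu bytes\n", packed_size(64, 3, 3, 4, 4));
    return 0;
}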
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_qa8_qa8.hpp" - -namespace depthwise -{ -template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>; -template class QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>; -template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 1, 1>; -template class QAsymm8DepthwiseConvolution<2, 2, 5, 5, 2, 2>; -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp deleted file mode 100644 index 1ae48b9417..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "impl_qa8_qs8_per_channel.hpp" - -namespace depthwise { -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>; -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>; -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>; -template class QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>; -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp deleted file mode 100644 index 4343f6ad45..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once -#include "depthwise.hpp" -#include "qasymm8.hpp" -#include "qsymm8.hpp" -#pragma once - -using namespace neon_convolution_kernels; -using namespace qasymm8; - -inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b) -{ - return vqrdmulhq_s32(a, b); -} - -inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b) -{ - return vqrdmulhq_n_s32(a, b); -} - -inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b) -{ - return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); -} - -inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift) -{ - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent) -{ - const int32x4_t shift = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent) -{ - const int32x2_t shift = vdup_n_s32(-exponent); - const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); - const int32x2_t fixed = vqadd_s32(x, fixup); - return vrshl_s32(fixed, shift); -} - -inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent) -{ - const int32x2_t xs = vdup_n_s32(x); - return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); -} - -namespace depthwise -{ - -namespace nck = neon_convolution_kernels; - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QAsymm8DepthwiseConvolution -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QAsymm8DepthwiseConvolution - >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& 
input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - const qasymm8::QAsymm8RescaleParams& rescale_parameters, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - const qasymm8::QAsymm8RescaleParams& rescale_parameters, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - protected: - uint8_t _input_padding_value(void) const; - - void _pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const; - - template - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); - - private: - // Quantization parameters - const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant; - const qasymm8::QAsymm8RescaleParams rescale_parameters; -}; - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QSymm8HybridPerChannelDepthwiseConvolution -> -{ - using Base = DepthwiseConvolutionBase< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, - StrideRows, StrideCols, - uint8_t, int32_t, uint8_t, - QSymm8HybridPerChannelDepthwiseConvolution - >; - friend Base; - using InputType = typename Base::InputType; - using OutputType = typename Base::OutputType; - - public: - QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - nck::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const 
qasymm8::QAsymm8Params& output_quantisation, - const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right - ); - - size_t get_packed_params_size(void) const override - { - return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t)); - - } - - protected: - uint8_t _input_padding_value(void) const; - - void _pack_params( - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases=nullptr - ) const; - - template - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride - ); - - template - void execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] - ); - - private: - // Quantization parameters - const qsymm8::QSymm8PerChannelParams _weights_quant; - const qasymm8::QAsymm8Params _input_quant, _output_quant; - const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters; -}; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp deleted file mode 100644 index a11b0981c9..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once -#include "depthwise_dilated.hpp" -#include "depthwise_quantized.hpp" - -namespace depthwise { - -template -class QAsymm8DilatedDepthwiseConvolution - : public DilatedDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, - StrideCols, uint8_t, int32_t, uint8_t> { -public: - /** Create a new dilated depthwise convolution engine. 
- */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); - - /** Create a new dilated depthwise convolution engine. - */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); - - /** Create a new dilated depthwise convolution engine. - */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams &rescale_parameters, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); - - /** Create a new dilated depthwise convolution engine. - */ - QAsymm8DilatedDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int dilation_factor, int n_output_rows, int n_output_cols, - nck::ActivationFunction activation, - const qasymm8::QAsymm8Params &weight_quantisation, - const qasymm8::QAsymm8Params &input_quantisation, - const qasymm8::QAsymm8Params &output_quantisation, - const qasymm8::QAsymm8RescaleParams& rescale_parameters, - unsigned int padding_top, unsigned int padding_left, - unsigned int padding_bottom, unsigned int padding_right); -}; - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp deleted file mode 100644 index 266d13d6fc..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp +++ /dev/null @@ -1,505 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#include -#include -#include "depthwise.hpp" -#include "padding.hpp" -#include "utils.hpp" - -#pragma once - -#define MEMBERFN(TOUT) template <\ - unsigned int OutputTileRows, unsigned int OutputTileColumns,\ - unsigned int KernelRows, unsigned int KernelColumns,\ - unsigned int StrideRows, unsigned int StrideColumns,\ - typename TIn, typename TBias, typename TOut,\ - typename Derived\ -> TOUT DepthwiseConvolutionBase<\ - OutputTileRows, OutputTileColumns,\ - KernelRows, KernelColumns,\ - StrideRows, StrideColumns,\ - TIn, TBias, TOut, Derived\ -> - -using namespace neon_convolution_kernels; - -namespace depthwise -{ - -template -struct PackParameters -{ - static void execute( - unsigned int n_channels, - void *buffer, - const void *weights, - unsigned int weight_row_stride, - unsigned int weight_col_stride, - const void *biases - ); -}; - -const unsigned int CHANNEL_BLOCK = 16; - -MEMBERFN(int)::get_output_size( - const int dim_size, const unsigned int padding_before, const unsigned int padding_after -) -{ - return iceildiv(dim_size + padding_before + padding_after - KernelRows + 1, StrideRows); -} - -MEMBERFN(int)::output_size( - const int dim_size, const unsigned int padding_before, const unsigned int padding_after -) const -{ - return get_output_size(dim_size, padding_before, padding_after); -} - -MEMBERFN()::DepthwiseConvolutionBase( - const int n_batches, - const int n_input_rows, - const int n_input_cols, - const int n_channels, - ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right -) : DepthwiseConvolutionBase( - n_batches, n_input_rows, n_input_cols, n_channels, - get_output_size(n_input_rows, padding_top, padding_bottom), - get_output_size(n_input_cols, padding_left, padding_right), - activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -MEMBERFN()::DepthwiseConvolutionBase( - const int n_batches, - const int n_input_rows, - const int n_input_cols, - const int n_channels, - const int n_output_rows, - const int n_output_cols, - ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right -) : _input(nullptr), _output(nullptr), - _packed_parameters(nullptr), - _working_space(nullptr), - _n_batches(n_batches), - _n_input_rows(n_input_rows), - _n_input_cols(n_input_cols), - _n_channels(n_channels), - _n_output_rows(n_output_rows), - _n_output_cols(n_output_cols), - _n_tile_rows(iceildiv(_n_output_rows, output_tile_rows)), - _n_tile_cols(iceildiv(_n_output_cols, output_tile_cols)), - _padding_top(padding_top), - _padding_left(padding_left), - _padding_bottom(padding_bottom), - _padding_right(padding_right), - _activation(activation), - _input_col_stride(0), _input_row_stride(0), _input_batch_stride(0), - _output_col_stride(0), _output_row_stride(0), _output_batch_stride(0) -{ -} - -MEMBERFN(void)::set_input(const 
void* const inptr) -{ - set_input(inptr, _n_channels); -} - -MEMBERFN(void)::set_input(const void* const inptr, const int ld_col) -{ - set_input(inptr, _n_input_cols * ld_col, ld_col); -} - -MEMBERFN(void)::set_input(const void* const inptr, const int ld_row, const int ld_col) -{ - set_input(inptr, _n_input_rows * ld_row, ld_row, ld_col); -} - -MEMBERFN(void)::set_input(const void* const inptr, const int ld_batch, const int ld_row, const int ld_col) -{ - _input = static_cast(inptr); - _input_batch_stride = ld_batch; - _input_row_stride = ld_row; - _input_col_stride = ld_col; -} - -MEMBERFN(void)::set_output(void* const outptr) -{ - set_output(outptr, _n_channels); -} - -MEMBERFN(void)::set_output(void* const outptr, const int ld_col) -{ - set_output(outptr, _n_output_cols * ld_col, ld_col); -} - -MEMBERFN(void)::set_output(void* const outptr, const int ld_row, const int ld_col) -{ - set_output(outptr, _n_output_rows * ld_row, ld_row, ld_col); -} - -MEMBERFN(void)::set_output(void* const outptr, const int ld_batch, const int ld_row, const int ld_col) -{ - _output = static_cast(outptr); - _output_batch_stride = ld_batch; - _output_row_stride = ld_row; - _output_col_stride = ld_col; -} - -MEMBERFN(size_t)::get_packed_params_size(void) const -{ - return _n_channels * (sizeof(TIn)*KernelRows*KernelColumns + sizeof(TBias)); -} - -MEMBERFN(void)::set_packed_params_buffer(void *buffer) -{ - _packed_parameters = buffer; -} - -MEMBERFN(void)::pack_params(const void *weights, const void *biases) const -{ - static_cast(this)->pack_params(_packed_parameters, weights, biases); -} - -MEMBERFN(void)::pack_params(void *buffer, const void *weights, const void *biases) const -{ - const unsigned int weight_col_stride = _n_channels; - const unsigned int weight_row_stride = KernelColumns * weight_col_stride; - static_cast(this)->pack_params( - buffer, weights, weight_row_stride, weight_col_stride, biases - ); -} - -MEMBERFN(void)::pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases -) const -{ - static_cast(this)->_pack_params( - buffer, weights, weight_row_stride, weight_col_stride, biases - ); -} - -MEMBERFN(void)::_pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases -) const -{ - // Default implementation - PackParameters::execute( - _n_channels, buffer, weights, weight_row_stride, weight_col_stride, biases - ); -} - -MEMBERFN(size_t)::get_working_space_size(const unsigned int nthreads) const -{ - return nthreads * ( - _get_input_working_space_size() + _get_output_working_space_size() - ); -} - -MEMBERFN(void)::set_working_space(void *buffer) -{ - _working_space = buffer; -} - -MEMBERFN(size_t)::_get_input_working_space_size(void) const -{ - return sizeof(TIn) * _n_channels; -} - -MEMBERFN(size_t)::_get_output_working_space_size(void) const -{ - return sizeof(TOut) * _n_channels; -} - -MEMBERFN(void *)::_get_input_working_space(const unsigned int threadid) const -{ - return static_cast(_working_space) + threadid * ( - _get_input_working_space_size() + _get_output_working_space_size() - ); -} - -MEMBERFN(void *)::_get_output_working_space(const unsigned int threadid) const -{ - return static_cast(_get_input_working_space(threadid)) + _get_input_working_space_size(); -} - -MEMBERFN(unsigned int)::get_window() const -{ - // Parallelise over blocks of channels. 
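get_window() above exposes parallelism in blocks of CHANNEL_BLOCK (16) channels, and run(start, stop, threadid) then maps window indices back onto channel ranges. A sketch of how a caller might split that window across threads; parallel_run and its even split are hypothetical, standing in for whatever scheduler actually drives run():

#include <algorithm>
#include <thread>
#include <vector>

// Split a kernel window of n_windows units across n_threads workers and
// invoke fn(start, stop, threadid) on each contiguous slice.
template <typename F>
void parallel_run(unsigned int n_windows, unsigned int n_threads, F fn)
{
    std::vector<std::thread> workers;
    const unsigned int per_thread = (n_windows + n_threads - 1) / n_threads;
    for (unsigned int t = 0; t < n_threads; t++)
    {
        const unsigned int start = t * per_thread;
        const unsigned int stop  = std::min(n_windows, start + per_thread);
        if (start >= stop) break;  // More threads than window units.
        workers.emplace_back([=] { fn(start, stop, t); });
    }
    for (auto &w : workers) w.join();
}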
- return iceildiv(_n_channels, CHANNEL_BLOCK); -} - -MEMBERFN(void)::run( - const unsigned int start, - const unsigned int stop, - const unsigned int threadid -) -{ - // Clear the input padding buffer - TIn *buf = static_cast(_get_input_working_space(threadid)); - const TIn pad_value = static_cast(this)->_input_padding_value(); - for (int n = 0; n < _n_channels; n++) - { - buf[n] = pad_value; - } - - // Parallelise over blocks of channels - const auto start_channel = CHANNEL_BLOCK * start; - const auto stop_channel = std::min(_n_channels, CHANNEL_BLOCK * stop); - const auto params_size_per_channel = this->get_packed_params_size()/_n_channels; - - // Compute top and bottom padding for input and output - const int input_pad_top = _padding_top; - const int input_pad_left = _padding_left; - constexpr int tile_overlap = kernel_rows - stride_rows; - - // Perform the convolution by calling `process_tile_row` for each tile row in - // each batch. - for (int batch = 0; batch < _n_batches; batch++) - { - const TIn* const inptr_batch = _input + batch*_input_batch_stride; - TOut* const outptr_batch = _output + batch*_output_batch_stride; - - // Loop over rows of tiles - for (int tile_i = 0; tile_i < _n_tile_rows; tile_i++) - { - // Pointer to the row - const int input_row_offset = (tile_i == 0) ? 0 : input_pad_top; - const TIn* const inptr_row = (inptr_batch + ((inner_tile_rows - tile_overlap)*tile_i - input_row_offset)*_input_row_stride); - TOut* const outptr_row = outptr_batch + output_tile_rows * tile_i * _output_row_stride; - - // Input padding (top + bottom) for the row - const int input_row_top = tile_i*(inner_tile_rows - tile_overlap) - input_pad_top; - const int input_row_bottom = input_row_top + inner_tile_rows; - const int input_row_pad_top = (tile_i == 0) ? input_pad_top : 0; - const int input_row_pad_bottom = std::max(0, input_row_bottom - _n_input_rows); - - // Output padding (bottom) for the row - const int output_row_bottom = (tile_i + 1)*output_tile_rows; - const int output_row_pad_bottom = std::max(0, output_row_bottom - _n_output_rows); - - // Get the offset into the packed parameters - const auto params_ptr = static_cast(_packed_parameters) + - start_channel*params_size_per_channel; - - // Process the row - process_tile_row( - threadid, - stop_channel - start_channel, - params_ptr, - inptr_row + start_channel, - outptr_row + start_channel, - input_row_pad_top, input_pad_left, input_row_pad_bottom, - output_row_pad_bottom, - _n_tile_cols, _n_input_cols, _n_output_cols - ); - } - } -} - -MEMBERFN(void)::process_tile_row( - const unsigned int threadid, - const int n_channels, - const void* const packed_params, - const TIn* const inptr, - TOut* const outptr, - const int row_pad_in_top, - const int row_pad_in_left, - const int row_pad_in_bottom, - const int row_pad_out_bottom, - const int n_tiles, - const int n_input_cols, - const int n_output_cols -) -{ - constexpr int tile_overlap = kernel_cols - stride_cols; - - // Loop over columns of tiles - for (int tile_j = 0; tile_j < n_tiles; tile_j++) - { - // Input padding (left + right) for the tile - const int t_pad_in_left = (tile_j == 0) ? 
row_pad_in_left : 0; - const int t_in_start = tile_j*(inner_tile_cols - tile_overlap) - row_pad_in_left; - const int t_in_end = t_in_start + inner_tile_cols; - const int t_pad_in_right = std::max(0, t_in_end - n_input_cols); - - // Output padding (right) for the tile - const int t_out_end = (tile_j + 1) * output_tile_cols; - const int t_pad_out_right = std::max(0, t_out_end - n_output_cols); - - // Get pointers into the inputs and outputs - const int col_offset = (tile_j == 0) ? 0 : row_pad_in_left; - const TIn* const inptr_col = (inptr + ((inner_tile_cols - tile_overlap)*tile_j - col_offset)*_input_col_stride); - TOut* const outptr_col = outptr + tile_j * output_tile_cols * _output_col_stride; - - // Process just this tile - process_tile( - threadid, n_channels, packed_params, inptr_col, outptr_col, - row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right, // Input paddings - row_pad_out_bottom, t_pad_out_right // Output paddings - ); - } -} - -MEMBERFN(TIn)::_input_padding_value(void) const -{ - return static_cast(0); -} - -MEMBERFN(void)::process_tile( - const unsigned int threadid, - const int n_channels, - const void* const packed_params, - const TIn* const inptr, - TOut* const outptr, - const int pad_in_top, - const int pad_in_left, - const int pad_in_bottom, - const int pad_in_right, - const int pad_out_bottom, - const int pad_out_right -) -{ - Derived * dthis = static_cast(this); - const bool pad_input = pad_in_top || pad_in_left || pad_in_bottom || pad_in_right; - const bool pad_output = pad_out_bottom || pad_out_right; - - if (!pad_input && !pad_output) - { - switch(_activation) - { - case ActivationFunction::ReLU: - dthis->template execute_tile( - n_channels, packed_params, - inptr, _input_row_stride, _input_col_stride, - outptr, _output_row_stride, _output_col_stride - ); - break; - case ActivationFunction::ReLU6: - dthis->template execute_tile( - n_channels, packed_params, - inptr, _input_row_stride, _input_col_stride, - outptr, _output_row_stride, _output_col_stride - ); - break; - default: - dthis->template execute_tile( - n_channels, packed_params, - inptr, _input_row_stride, _input_col_stride, - outptr, _output_row_stride, _output_col_stride - ); - break; - } - } - else - { - // Create arrays of input and output pointers, pointing padded elements to - // the working space padding buffers provided. 
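For tiles that touch padding, process_tile builds (in the code that follows) an array of per-element pointers in which every out-of-range tap is redirected at the cleared working-space buffer, so the inner compute loop stays branch-free. The idea in miniature, on a 1-D row (illustrative only):

#include <cstdio>

int main()
{
    const float row[4] = {1.f, 2.f, 3.f, 4.f};
    const float pad_value = 0.f;  // Stands in for the cleared padding buffer.

    // Point out-of-bounds taps at the padding value, in-bounds taps at data.
    const float *taps[6];
    for (int j = -1; j < 5; j++)
        taps[j + 1] = (j < 0 || j >= 4) ? &pad_value : &row[j];

    // The compute loop then dereferences unconditionally, with no bounds checks.
    for (int j = 0; j < 6; j++)
        std::printf("%g ", *taps[j]);  // Prints: 0 1 2 3 4 0
    std::printf("\n");
    return 0;
}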
- const TIn *inptrs[inner_tile_rows][inner_tile_cols]; - for (int i = 0; i < inner_tile_rows; i++) - { - for (int j = 0; j < inner_tile_cols; j++) - { - if (i < pad_in_top || (inner_tile_rows - pad_in_bottom) <= i || - j < pad_in_left || (inner_tile_cols - pad_in_right) <= j) - { - // Padded input - inptrs[i][j] = static_cast(_get_input_working_space(threadid)); - } - else - { - inptrs[i][j] = inptr + (i - pad_in_top)*_input_row_stride + (j - pad_in_left)*_input_col_stride; - } - } - } - - TOut *outptrs[output_tile_rows][output_tile_cols]; - for (int i = 0; i < output_tile_rows; i++) - { - for (int j = 0; j < output_tile_cols; j++) - { - if (i < (output_tile_rows - pad_out_bottom) && - j < (output_tile_cols - pad_out_right)) - { - outptrs[i][j] = outptr + i*_output_row_stride + j*_output_col_stride; - } - else - { - outptrs[i][j] = static_cast(_get_output_working_space(threadid)); - } - } - } - - switch(_activation) - { - case ActivationFunction::ReLU: - dthis->template execute_tile( - n_channels, packed_params, inptrs, outptrs - ); - break; - case ActivationFunction::ReLU6: - dthis->template execute_tile( - n_channels, packed_params, inptrs, outptrs - ); - break; - default: - dthis->template execute_tile( - n_channels, packed_params, inptrs, outptrs - ); - break; - } - } -} - -MEMBERFN(int)::n_channels(void) const -{ - return _n_channels; -} - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp deleted file mode 100644 index 4130188187..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
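Both branches of process_tile above switch on the runtime _activation value to select an execute_tile instance specialised at compile time, so the clamping code folds away entirely when no activation is requested. The pattern, reduced to essentials (Act, tile and run_tile are hypothetical names):

#include <algorithm>

enum class Act { None, ReLU, ReLU6 };

// Compile-time specialised kernel: unused clamps are dead code and vanish.
template <Act A>
void tile(float *out, const float *in, int n)
{
    for (int i = 0; i < n; i++)
    {
        float v = in[i];
        if (A == Act::ReLU || A == Act::ReLU6) v = std::max(v, 0.0f);
        if (A == Act::ReLU6)                   v = std::min(v, 6.0f);
        out[i] = v;
    }
}

// Runtime-to-compile-time dispatch, as in process_tile above.
void run_tile(Act a, float *out, const float *in, int n)
{
    switch (a)
    {
        case Act::ReLU:  tile<Act::ReLU>(out, in, n);  break;
        case Act::ReLU6: tile<Act::ReLU6>(out, in, n); break;
        default:         tile<Act::None>(out, in, n);  break;
    }
}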
- */ - -#include "depthwise_dilated.hpp" -#include "utils.hpp" - -#define MEMBERFN(TOUT) \ - template \ - TOUT DilatedDepthwiseConvolution - -namespace depthwise { - -MEMBERFN() -::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int dilation_factor, - nck::ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right) - : DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - DilatedDepthwiseConvolution::get_output_size( - n_input_rows, padding_top, padding_bottom, dilation_factor), - DilatedDepthwiseConvolution::get_output_size( - n_input_cols, padding_left, padding_right, dilation_factor), - activation, padding_top, padding_left, padding_bottom, - padding_right) {} - -MEMBERFN() -::DilatedDepthwiseConvolution(const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int dilation_factor, - const int n_output_rows, const int n_output_cols, - nck::ActivationFunction activation, - const unsigned int padding_top, - const unsigned int padding_left, - const unsigned int, // padding_bottom - const unsigned int // padding_right - ) - : DilatedDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, dilation_factor, - n_output_rows, n_output_cols, activation, padding_top, padding_left, - 0, 0, - // Function which creates a new (standard) depthwise convolution - [](const int n_batches, const int n_input_rows, - const int n_input_cols, const int n_channels, - const int n_output_rows, const int n_output_cols, - const nck::ActivationFunction activation, - const unsigned int padding_top, const unsigned int padding_left, - const unsigned int padding_bottom, - const unsigned int padding_right) -> IDepthwiseConvolution * { - return new DepthwiseConvolution< - OutputTileRows, OutputTileColumns, KernelRows, KernelColumns, - StrideRows, StrideColumns, TIn, TBias, TOut>( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, padding_top, - padding_left, padding_bottom, padding_right); - }) {} - -MEMBERFN() -::DilatedDepthwiseConvolution( - const int n_batches, const int n_input_rows, const int n_input_cols, - const int n_channels, const int dilation_factor, const int n_output_rows, - const int n_output_cols, nck::ActivationFunction activation, - const unsigned int padding_top, const unsigned int padding_left, - const unsigned int, // padding_bottom - const unsigned int, // padding_right - std::function - subconvfn // Function to create a new convolution - ) - : _dilation_factor(dilation_factor), _n_input_rows(n_input_rows), - _n_input_cols(n_input_cols), _n_channels(n_channels), - _padding_top(static_cast(padding_top)), - _padding_left(static_cast(padding_left)), - _n_output_rows(n_output_rows), _n_output_cols(n_output_cols), - _convs(_dilation_factor) { - // Instantiate the base convolutions - for (uint32_t i = 0; i < static_cast(_dilation_factor); i++) { - // Compute properties of this row of base convolutions - const int row_top = - i * StrideRows - _padding_top; // -ve values are in the padding - const int row_pad_top = - row_top < 0 ? 
iceildiv(-row_top, dilation_factor) : 0; - - const int _n_input_rows = iceildiv(n_input_rows - i, dilation_factor); - const int _n_output_rows = iceildiv(n_output_rows - i, dilation_factor); - - for (uint32_t j = 0; j < static_cast(_dilation_factor); j++) { - // Compute properties of the base convolution - const int col_left = - j * StrideColumns - padding_left; // -ve values are in the padding - const int col_pad_left = - col_left < 0 ? iceildiv(-col_left, dilation_factor) : 0; - - const int _n_input_cols = iceildiv(n_input_cols - j, dilation_factor); - const int _n_output_cols = iceildiv(n_output_cols - j, dilation_factor); - - // Create new depthwise convolution engine and include it in the vector - // of engines. The new depthwise convolution engine is created by calling - // the delegate function we received as an argument. - _convs[i].emplace_back(subconvfn( - n_batches, _n_input_rows, _n_input_cols, n_channels, _n_output_rows, - _n_output_cols, activation, - // Note: since we have computed the output tensor size we don't need - // to explicitly provide bottom and right padding values to the - // depthwise convolution. - row_pad_top, col_pad_left, 0, 0)); - } - } -} - -MEMBERFN(void)::set_input(const void *const inptr) { - set_input(inptr, _n_channels); -} - -MEMBERFN(void)::set_input(const void *const inptr, const int ldcol) { - set_input(inptr, _n_input_cols * ldcol, ldcol); -} - -MEMBERFN(void) -::set_input(const void *const inptr, const int ldrow, const int ldcol) { - set_input(inptr, _n_input_rows * ldrow, ldrow, ldcol); -} - -MEMBERFN(void) -::set_input(const void *const inptr, const int ldbatch, const int ldrow, - const int ldcol) { - // Compute dilated strides - const int ldrow_dilated = ldrow * _dilation_factor; - const int ldcol_dilated = ldcol * _dilation_factor; - - // Pass input parameters on to base convolutions - for (uint32_t i = 0; i < static_cast(_dilation_factor); i++) { - const int top_pos = - i * StrideRows - _padding_top + - ((static_cast(i * StrideRows) < _padding_top) - ? iceildiv(_padding_top - i * StrideRows, _dilation_factor) * - _dilation_factor - : 0); - const TIn *const inptr_i = - static_cast(inptr) + top_pos * ldrow; - - for (uint32_t j = 0; j < static_cast(_dilation_factor); j++) { - int left_pos = j * StrideColumns - _padding_left; - while (left_pos < 0) - left_pos += _dilation_factor; - - // Modify the pointer to point to the first element of the dilated input - // tensor, then set the input for this convolution engine. 
- const void *const inptr_ij = inptr_i + left_pos * ldcol; - _convs[i][j]->set_input(inptr_ij, ldbatch, ldrow_dilated, ldcol_dilated); - } - } -} - -MEMBERFN(void)::set_output(void *const outptr) { - set_output(outptr, _n_channels); -} - -MEMBERFN(void)::set_output(void *const outptr, const int ldcol) { - set_output(outptr, _n_output_cols * ldcol, ldcol); -} - -MEMBERFN(void) -::set_output(void *const outptr, const int ldrow, const int ldcol) { - set_output(outptr, _n_output_rows * ldrow, ldrow, ldcol); -} - -MEMBERFN(void) -::set_output(void *const outptr, const int ldbatch, const int ldrow, - const int ldcol) { - // Compute dilated strides - const int ldrow_dilated = ldrow * _dilation_factor; - const int ldcol_dilated = ldcol * _dilation_factor; - - // Pass input parameters on to base convolutions - for (uint32_t i = 0; i < static_cast(_dilation_factor); i++) { - for (uint32_t j = 0; j < static_cast(_dilation_factor); j++) { - // Modify the pointer to point to the first element of the dilated input - // tensor, then set the input for this convolution engine. - void *const outptr_ij = - static_cast(outptr) + i * ldrow + j * ldcol; - _convs[i][j]->set_output(outptr_ij, ldbatch, ldrow_dilated, - ldcol_dilated); - } - } -} - -MEMBERFN(int) -::get_output_size(const int dim_size, const unsigned int padding_before, - const unsigned int padding_after, const int dilation_factor) { - const int input_size = - dim_size + static_cast(padding_before + padding_after); - const int window_size = (KernelRows - 1) * dilation_factor + 1; - return iceildiv(input_size - window_size + 1, StrideRows); -} - -MEMBERFN(int) -::output_size(const int dim_size, const unsigned int padding_before, - const unsigned int padding_after) const { - return get_output_size(dim_size, padding_before, padding_after, - _dilation_factor); -} - -MEMBERFN(size_t)::get_packed_params_size(void) const { - return _convs[0][0]->get_packed_params_size(); -} - -MEMBERFN(void)::set_packed_params_buffer(void *buffer) { - // Set the buffer for all convolution engines - for (auto &&row : _convs) { - for (auto &&conv : row) { - conv->set_packed_params_buffer(buffer); - } - } -} - -MEMBERFN(void) -::pack_params(const void *const weights, const void *const biases) const { - _convs[0][0]->pack_params(weights, biases); -} - -MEMBERFN(void) -::pack_params(void *const buffer, const void *const weights, - const void *const biases) const { - _convs[0][0]->pack_params(buffer, weights, biases); -} - -MEMBERFN(void) -::pack_params(void *const buffer, const void *const weights, - const unsigned int ldrow, const unsigned int ldcol, - const void *const biases) const { - _convs[0][0]->pack_params(buffer, weights, ldrow, ldcol, biases); -} - -MEMBERFN(size_t)::get_working_space_size(unsigned int nthreads) const { - return _convs[0][0]->get_working_space_size(nthreads); -} - -MEMBERFN(void)::set_working_space(void *const ws) { - // Use the same working space set for all contained depthwise engines. 
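get_output_size above generalises the usual output-extent formula by replacing the kernel extent with the effective dilated window (KernelRows - 1) * dilation_factor + 1: a 3-tap kernel at dilation 2 spans 5 rows, so a 17-row unpadded stride-1 input yields iceildiv(17 - 5 + 1, 1) = 13 output rows. The same arithmetic as a standalone check (dilated_output_size is an illustrative helper, not the library API):

#include <cassert>

static int iceildiv(int a, int b) { return (a + b - 1) / b; }

// Output extent of one dimension of a dilated convolution.
static int dilated_output_size(int dim, int pad_before, int pad_after,
                               int kernel, int stride, int dilation)
{
    const int window = (kernel - 1) * dilation + 1;  // Effective kernel extent.
    return iceildiv(dim + pad_before + pad_after - window + 1, stride);
}

int main()
{
    assert(dilated_output_size(17, 0, 0, 3, 1, 2) == 13);
    assert(dilated_output_size(17, 0, 0, 3, 1, 1) == 15);  // d=1: plain conv.
    return 0;
}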
- for (auto &&row : _convs) { - for (auto &&conv : row) { - conv->set_working_space(ws); - } - } -} - -MEMBERFN(unsigned int)::get_window(void) const { - return _convs[0][0]->get_window(); -} - -MEMBERFN(void) -::run(const unsigned int start, const unsigned int stop, - const unsigned int threadid) { - // Run each contained convolution in turn - for (auto &&row : _convs) { - for (auto &&conv : row) { - conv->run(start, stop, threadid); - } - } -} - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp deleted file mode 100644 index a00a1ef04a..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#include "arm.hpp" -#include "impl_base.hpp" - -#pragma once - -using namespace neon_convolution_kernels; - -namespace depthwise -{ - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float16_t *input, - const unsigned int in_row_stride, - const unsigned int in_col_stride, - float16_t *output, - const unsigned int out_row_stride, - const unsigned int out_col_stride -) -{ - // Instantiate pointers - const float16_t* __restrict__ inptr_base = input; - float16_t* __restrict__ outptr_base = output; - const float16_t* __restrict__ params = static_cast(weights_biases_ptr); - - // Perform the depthwise convolution - int channels_remaining = n_channels; - for (; channels_remaining >= 8; channels_remaining -= 8) - { - // Load input tile - float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float16_t* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f16(inptr_row + j*in_col_stride); - } - } - inptr_base += 8; - - // Load weights tile - float16x8_t vbias = vld1q_f16(params); - params += 8; - - float16x8_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f16(params); - params += 8; - } - } - - // Perform the convolution - float16x8_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - 
const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j])); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float16_t* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f16(outptr_row + j*out_col_stride, v[i][j]); - } - } - outptr_base += 8; - } - for (; channels_remaining; channels_remaining--) - { - // Load input tile - float16_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float16_t* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptr_row + j*in_col_stride); - } - } - inptr_base++; - - // Load weights tile - float16_t bias = *(params++); - float16_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float16_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float16_t* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptr_row + j*out_col_stride) = v[i][j]; - } - } - outptr_base++; - } -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float16_t, float16_t, float16_t ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - // Instantiate pointers - const float16_t* __restrict__ params = static_cast(weights_biases_ptr); - int n = 0; - - 
// Perform the depthwise convolution - int channels_remaining = n_channels; - for (; channels_remaining >= 8; channels_remaining -= 8, n += 8) - { - // Load input tile - float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f16(inptrs[i][j] + n); - } - } - - // Load weights tile - float16x8_t vbias = vld1q_f16(params); - params += 8; - - float16x8_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f16(params); - params += 8; - } - } - - // Perform the convolution - float16x8_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j])); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f16(outptrs[i][j] + n, v[i][j]); - } - } - } - for (; channels_remaining; channels_remaining--, n++) - { - // Load input tile - float16_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptrs[i][j] + n); - } - } - - // Load weights tile - float16_t bias = *(params++); - float16_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float16_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptrs[i][j] 
+ n) = v[i][j]; - } - } - } -} - -} // namespace depthwise -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp deleted file mode 100644 index b0d8126a40..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- */ - -#include "arm.hpp" -#include "impl_base.hpp" - -#pragma once - -using namespace neon_convolution_kernels; - -namespace depthwise -{ - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - ActivationFunction activation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, activation, - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float *input, - const unsigned int in_row_stride, - const unsigned int in_col_stride, - float *output, - const unsigned int out_row_stride, - const unsigned int out_col_stride -) -{ - // Instantiate pointers - const float* __restrict__ inptr_base = input; - float* __restrict__ outptr_base = output; - const float* __restrict__ params = static_cast(weights_biases_ptr); - - // Perform the depthwise convolution - int channels_remaining = n_channels; - for (; channels_remaining >= 4; channels_remaining -= 4) - { - // Load input tile - float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f32(inptr_row + j*in_col_stride); - } - } - inptr_base += 4; - - // Load weights tile - float32x4_t vbias = vld1q_f32(params); - params += 4; - - float32x4_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f32(params); - params += 4; - } - } - - // Perform the convolution - float32x4_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < 
KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f32(outptr_row + j*out_col_stride, v[i][j]); - } - } - outptr_base += 4; - } - for (; channels_remaining; channels_remaining--) - { - // Load input tile - float u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - const float* const inptr_row = inptr_base + i*in_row_stride; - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptr_row + j*in_col_stride); - } - } - inptr_base++; - - // Load weights tile - float bias = *(params++); - float w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - float* const outptr_row = outptr_base + i*out_row_stride; - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptr_row + j*out_col_stride) = v[i][j]; - } - } - outptr_base++; - } -} - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void DepthwiseConvolution< - OutputTileRows, OutputTileCols, - KernelRows, KernelCols, StrideRows, StrideCols, - float, float, float ->::execute_tile( - int n_channels, - const void *weights_biases_ptr, - const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - float *outptrs[Base::output_tile_rows][Base::output_tile_cols] -) -{ - const float* __restrict__ params = static_cast(weights_biases_ptr); - - // Perform the depthwise convolution - int channels_remaining = n_channels; - int n = 0; - for (; channels_remaining >= 4; channels_remaining -= 4, n += 4) - { - // Load input tile - float32x4_t 
u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = vld1q_f32(inptrs[i][j] + n); - } - } - - // Load weights tile - float32x4_t vbias = vld1q_f32(params); - params += 4; - - float32x4_t w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = vld1q_f32(params); - params += 4; - } - } - - // Perform the convolution - float32x4_t v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - v[out_i][out_j] = vbias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const unsigned int j = base_j + in_j; - - // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]); - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f)); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f)); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - vst1q_f32(outptrs[i][j] + n, v[i][j]); - } - } - } - for (; channels_remaining; channels_remaining--, n++) - { - // Load input tile - float u[Base::inner_tile_rows][Base::inner_tile_cols]; - for (int i = 0; i < Base::inner_tile_rows; i++) - { - for (int j = 0; j < Base::inner_tile_cols; j++) - { - u[i][j] = *(inptrs[i][j] + n); - } - } - - // Load weights tile - float bias = *(params++); - float w[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - w[i][j] = *(params++); - } - } - - // Perform the convolution - float v[OutputTileRows][OutputTileCols]; - for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) - { - for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) - { - // Clear the accumulator - v[out_i][out_j] = bias; - - // Base co-ordinate - const int base_i = out_i * StrideRows; - const int base_j = out_j * StrideCols; - - // Fill the accumulator - for (unsigned int in_i = 0; in_i < KernelRows; in_i++) - { - const unsigned int i = base_i + in_i; - for (unsigned int in_j = 0; in_j < KernelCols; in_j++) - { - const int j = base_j + in_j; - v[out_i][out_j] += w[in_i][in_j] * u[i][j]; - } - } - - // Apply the activation function - if (Activation == ActivationFunction::ReLU || - Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); - } - if (Activation == ActivationFunction::ReLU6) - { - v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); - } - } - } - - // Store the output tile - for (unsigned int i = 0; i < OutputTileRows; i++) - { - for (unsigned int j = 0; j < OutputTileCols; j++) - { - *(outptrs[i][j] + n) = v[i][j]; - } - } - } -} - -} // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp deleted file 
mode 100644 index e8b4c7bc0f..0000000000 --- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#include - -#include "arm.hpp" -#include "impl_base.hpp" -#include "depthwise_quantized.hpp" - -namespace depthwise -{ -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - const ActivationFunction activation, - const QAsymm8Params& weight_quantisation, - const QAsymm8Params& input_quantisation, - const QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : QAsymm8DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - activation, weight_quantisation, input_quantisation, output_quantisation, - QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QAsymm8DepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QAsymm8DepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - int n_output_rows, int n_output_cols, - const ActivationFunction activation, - const QAsymm8Params& weight_quantisation, - const QAsymm8Params& input_quantisation, - const QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : QAsymm8DepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - n_output_rows, n_output_cols, - activation, 
weight_quantisation, input_quantisation, output_quantisation,
-    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
-    padding_top, padding_left, padding_bottom, padding_right
-  )
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  const QAsymm8RescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-    n_batches, n_input_rows, n_input_cols, n_channels, activation,
-    padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _inputs_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::QAsymm8DepthwiseConvolution(
-  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-  int n_output_rows, int n_output_cols,
-  const ActivationFunction activation,
-  const QAsymm8Params& weight_quantisation,
-  const QAsymm8Params& input_quantisation,
-  const QAsymm8Params& output_quantisation,
-  const QAsymm8RescaleParams& rescale_params,
-  unsigned int padding_top,
-  unsigned int padding_left,
-  unsigned int padding_bottom,
-  unsigned int padding_right
-) : Base(
-    n_batches, n_input_rows, n_input_cols, n_channels,
-    n_output_rows, n_output_cols, activation,
-    padding_top, padding_left, padding_bottom, padding_right
-  ),
-  _weights_quant(weight_quantisation),
-  _inputs_quant(input_quantisation),
-  _output_quant(output_quantisation),
-  rescale_parameters(rescale_params)
-{
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-uint8_t QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_input_padding_value(void) const
-{
-  return _inputs_quant.offset;
-}
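Worth spelling out why _input_padding_value() returns the input offset: with asymmetric quantisation a real value r is stored as q = r/scale + offset, so the offset (zero-point) is precisely the encoding of r = 0, and padding with it is equivalent to zero-padding in the real domain. A self-contained sketch of that identity; the QParams struct and quantize helper below are illustrative stand-ins, not part of this patch:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical stand-in for the QAsymm8Params used above.
struct QParams
{
    float   scale;
    uint8_t offset; // zero-point: the uint8 encoding of real 0.0
};

// q = round(r / scale) + offset, clamped to the uint8 range.
inline uint8_t quantize(float r, const QParams &p)
{
    const int q = static_cast<int>(std::lround(r / p.scale)) + p.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// quantize(0.0f, p) == p.offset, which is exactly what
// _input_padding_value() hands back for the border regions.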
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::_pack_params(
-  void * const buffer,
-  const void * const weights,
-  const unsigned int weight_row_stride,
-  const unsigned int weight_col_stride,
-  const void * const biases
-) const
-{
-  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
-  const int32_t *bptr = static_cast<const int32_t *>(biases);
-  uint8_t *outptr = static_cast<uint8_t *>(buffer);
-
-  // We set the vector length to use doubles on both Aarch64 and Aarch32.  NOTE
-  // For SVE set this to half the vector length.
-  unsigned int veclen = 8;
-
-  // While there are channels left to process, pack a vector length of them at
-  // a time and reduce the size of vector used as the size of the tensor
-  // decreases.
-  for (
-    unsigned int n_channels = this->n_channels(); n_channels;
-    n_channels -= veclen,
-    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
-  )
-  {
-    // NOTE Ignore this section if using SVE, the vector length remains the
-    // same and we just don't fill a full register for the tail.
-    while (n_channels < veclen)
-    {
-      // Reduce the vector length to either 8 or 1 (scalar)
-      // TODO Support more vector lengths in `execute_tile`.
-      veclen = (veclen == 16) ? 8 : 1;
-    }
-
-    // Get pointers to bias and weight portions of the output structure.
-    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
-    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
-
-    // Copy a vector length of elements
-    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
-    {
-      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
-      out_bptr[n] = bias;
-
-      for (unsigned int i = 0; i < KernelRows; i++)
-      {
-        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
-        for (unsigned int j = 0; j < KernelCols; j++)
-        {
-          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
-          row_outptr[j*veclen + n] = w;
-        }
-      }
-      wptr++;
-    }
-  }
-}
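The buffer produced by _pack_params groups channels veclen at a time: veclen int32 biases first, then the kernel weights with the channel index innermost, so a vector load of veclen weights for one kernel tap hits contiguous memory. A small reader that walks the same layout may make this easier to see; UnpackedGroup and read_group are hypothetical helpers for illustration only:

#include <cstdint>
#include <cstring>
#include <vector>

// One packed group, mirroring the layout written by the loop above:
// veclen int32 biases, then rows*cols*veclen uint8 weights with channels
// innermost (weight (i, j, n) sits at weights[(i*cols + j)*veclen + n]).
struct UnpackedGroup
{
    std::vector<int32_t> biases;  // [veclen]
    std::vector<uint8_t> weights; // [rows * cols * veclen]
};

UnpackedGroup read_group(const uint8_t *packed, unsigned veclen,
                         unsigned kernel_rows, unsigned kernel_cols)
{
    UnpackedGroup g;
    g.biases.resize(veclen);
    std::memcpy(g.biases.data(), packed, veclen * sizeof(int32_t));
    const uint8_t *w = packed + veclen * sizeof(int32_t);
    g.weights.assign(w, w + kernel_rows * kernel_cols * veclen);
    return g;
}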
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void tilefn(
-  int n_channels,
-  const void* packed_params,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  const int32_t clamp_max,
-  const int32_t clamp_min,
-  const uint8_t input_offset,
-  const uint8_t weight_offset,
-  const uint8_t output_offset,
-  const int32_t requant_multiplier,
-  const int32_t requant_shift
-)
-{
-  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
-  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
-
-  // Offset into channels
-  int channel = 0;
-
-  // Byte type pointer to weights and biases
-  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
-
-  for (; n_channels >= 8; n_channels -= 8, channel += 8)
-  {
-    const int32x4_t biases[2] = {
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
-      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
-    };
-    wbptr += 8*sizeof(int32_t);
-
-    int16x8_t weights[KernelRows][KernelCols];
-    const uint8x8_t woffset = vdup_n_u8(weight_offset);
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        const uint8x8_t w = vld1_u8(wbptr);
-        weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
-        wbptr += 8;
-      }
-    }
-
-    int16x8_t inputs[InnerTileRows][InnerTileCols];
-    const uint8x8_t ioffset = vdup_n_u8(input_offset);
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        const auto x = vld1_u8(get_input_ptr(i, j, channel));
-        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
-      }
-    }
-
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32x4_t acc_a = biases[0], acc_b = biases[1];
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj];
-            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
-#ifndef __aarch64__
-            acc_a = vmlal_s16(acc_a, vget_low_s16(w), vget_low_s16(x));
-            acc_b = vmlal_s16(acc_b, vget_high_s16(w), vget_high_s16(x));
-#else
-            asm("smlal %[acc_a].4s, %[w].4h, %[x].4h\n"
-                "smlal2 %[acc_b].4s, %[w].8h, %[x].8h\n"
-                : [acc_a] "+w"(acc_a), [acc_b] "+w"(acc_b)
-                : [w] "w"(w), [x] "w"(x));
-#endif  // __aarch64__
-          }
-        }
-
-        int32x4_t final_accs[2];
-        for (unsigned int i = 0; i < 2; i++)
-        {
-          const int32x4_t y = rounding_divide_by_exp2(
-            saturating_doubling_high_mul((i == 0 ? acc_a : acc_b), requant_multiplier),
-            requant_shift);
-          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
-          final_accs[i] = vaddq_s32(y, offset);
-          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
-          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
-        }
-
-#ifndef __aarch64__
-        const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
-                                             vreinterpretq_s16_s32(final_accs[1]));
-        const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]);
-
-        const int8x16x2_t zoutput = vuzpq_s8(elems, elems);
-        const uint8x8_t output =
-            vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0]));
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-#else
-        const int8x16_t elems = vreinterpretq_s8_s16(
-          vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]),
-                     vreinterpretq_s16_s32(final_accs[1])));
-        const uint8x8_t output =
-          vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems)));
-        vst1_u8(get_output_ptr(oi, oj, channel), output);
-#endif  // __aarch64__
-      }
-    }
-  }
-  for (; n_channels; n_channels--, channel++)
-  {
-    // Load bias
-    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
-    wbptr += sizeof(int32_t);
-
-    // Load weights
-    int16_t weights[KernelRows][KernelCols];
-    for (unsigned int i = 0; i < KernelRows; i++)
-    {
-      for (unsigned int j = 0; j < KernelCols; j++)
-      {
-        weights[i][j] = *(wbptr++) - weight_offset;
-      }
-    }
-
-    // Load the input activations
-    int16_t inputs[InnerTileRows][InnerTileCols];
-    for (unsigned int i = 0; i < InnerTileRows; i++)
-    {
-      for (unsigned int j = 0; j < InnerTileCols; j++)
-      {
-        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
-      }
-    }
-
-    // Perform the convolution
-    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
-    {
-      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
-      {
-        int32_t acc = bias;
-
-        for (unsigned int wi = 0; wi < KernelRows; wi++)
-        {
-          for (unsigned int wj = 0; wj < KernelCols; wj++)
-          {
-            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
-            acc += w * x;
-          }
-        }
-
-        // Requantize
-        acc = rounding_divide_by_exp2(
-          saturating_doubling_high_mul(acc, requant_multiplier),
-          requant_shift);
-        acc += output_offset;
-        acc = std::max(acc, clamp_min);
-        acc = std::min(acc, clamp_max);
-        uint8_t output = static_cast<uint8_t>(acc);
-        *(get_output_ptr(oi, oj, channel)) = output;
-      }
-    }
-  }
-}
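The scalar tail above makes the requantisation scheme explicit: the int32 accumulator is scaled by a Q0.31 fixed-point multiplier via a saturating rounding doubling high multiply, then rounding-shifted right. A plain C++ reference of the two primitives, written against the usual gemmlowp definitions that these NEON helpers implement (names here are illustrative, chosen to avoid shadowing the helpers in the file):

#include <cstdint>
#include <limits>

// (2 * a * b) / 2^32, rounded to nearest, with the single overflowing case
// (a == b == INT32_MIN) saturated. Mirrors the behaviour of vqrdmulhq_n_s32.
inline int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == b && a == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

// Divide by 2^exponent, rounding to nearest with ties away from zero.
inline int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// The scale-by-(multiplier, shift) core of the requantisation in tilefn;
// adding the output offset and clamping happen afterwards.
inline int32_t requantize_core(int32_t acc, int32_t multiplier, int shift)
{
    return rounding_divide_by_pot(sat_rounding_doubling_high_mul(acc, multiplier), shift);
}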
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename FInput, typename FOutput
->
-static inline void execute_tilefn(
-  int n_channels,
-  const void* packed_params,
-  const nck::ActivationFunction actfn,
-  FInput &get_input_ptr,
-  FOutput &get_output_ptr,
-  const QAsymm8Params &input_quant,
-  const QAsymm8Params &weight_quant,
-  const QAsymm8Params &output_quant,
-  const QAsymm8RescaleParams &requant
-) {
-  // Compute min/max clamp values
-  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
-  int32_t clamp_max = std::numeric_limits<uint8_t>::max();
-
-  if (actfn == nck::ActivationFunction::ReLU ||
-      actfn == nck::ActivationFunction::ReLU6)
-  {
-    const int32_t bottom_rail = output_quant.offset;
-    clamp_min = std::max(clamp_min, bottom_rail);
-  }
-
-  if (actfn == nck::ActivationFunction::ReLU6) {
-    const int32_t top_rail = output_quant.quantize(6.0f);
-    clamp_max = std::min(clamp_max, top_rail);
-  }
-
-  // Call the tile execution method
-  tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-         StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
-                     clamp_max, clamp_min, input_quant.offset,
-                     weight_quant.offset, output_quant.offset,
-                     requant.multiplier, requant.shift);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  uint8_t* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
-      const int i, const int j, const int channel) {
-    return inptr + i * in_row_stride + j * in_col_stride + channel;
-  };
-
-  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
-      const int i, const int j, const int channel) {
-    return outptr + i * out_row_stride + j * out_col_stride + channel;
-  };
-
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-template <ActivationFunction Activation>
-void QAsymm8DepthwiseConvolution<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
->::execute_tile(
-  int n_channels,
-  const void* packed_params,
-  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-) {
-  // Construct methods to get pointers
-  const auto get_input_ptr = [inptrs](const int i, const int j,
-                                      const int channel) {
-    return inptrs[i][j] + channel;
-  };
-
-  const auto get_output_ptr = [outptrs](const int i, const int j,
-                                        const int channel) {
-    return outptrs[i][j] + channel;
-  };
-
-  // Call the tile execution method
-  execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                 StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
-      _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
-}
-
-}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
deleted file mode 100644
index 68e20d98a9..0000000000
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * - * NOTE: Header to be included by implementation files only. - * - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#include - -#include "arm.hpp" -#include "impl_base.hpp" -#include "depthwise_quantized.hpp" - -#pragma once - -namespace { - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput -> -static inline void tilefn_hybrid( - int n_channels, - const void* packed_params, - FInput &get_input_ptr, - FOutput &get_output_ptr, - int32_t clamp_min, - int32_t clamp_max, - uint8_t input_offset, - uint8_t output_offset -) -{ - constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows; - constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols; - - // Offset into channels - int channel = 0; - - // Byte type pointer to weights and biases - const int8_t *wbptr = static_cast(packed_params); - - for (; n_channels >= 8; n_channels -= 8, channel += 8) - { - const int32x4_t biases[2] = { - vld1q_s32(reinterpret_cast(wbptr)), - vld1q_s32(reinterpret_cast(wbptr) + 4), - }; - const int32x4_t multipliers[2] = { - vld1q_s32(reinterpret_cast(wbptr) + 8), - vld1q_s32(reinterpret_cast(wbptr) + 12), - }; - const int32x4_t shifts[2] = { - vld1q_s32(reinterpret_cast(wbptr) + 16), - vld1q_s32(reinterpret_cast(wbptr) + 20), - }; - wbptr += 24*sizeof(int32_t); - - int16x8_t weights[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - const auto w = vld1_s8(wbptr); - weights[i][j] = reinterpret_cast(vmovl_s8(w)); - wbptr += 8; - } - } - - int16x8_t inputs[InnerTileRows][InnerTileCols]; - const uint8x8_t ioffset = vdup_n_u8(input_offset); - for (unsigned int i = 0; i < InnerTileRows; i++) - { - for (unsigned int j = 0; j < InnerTileCols; j++) - { - const auto x = vld1_u8(get_input_ptr(i, j, channel)); - inputs[i][j] = reinterpret_cast(vsubl_u8(x, ioffset)); - } - } - - for (unsigned int oi = 0; oi < OutputTileRows; oi++) - { - for (unsigned int oj = 0; oj < OutputTileCols; oj++) - { - int32x4_t accs[2]; - for (unsigned int i = 0; i < 2; i++) - { - accs[i] = biases[i]; - } - - for (unsigned int wi = 0; wi 
< KernelRows; wi++) - { - for (unsigned int wj = 0; wj < KernelCols; wj++) - { - const auto w = weights[wi][wj]; - const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj]; - accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x)); - accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x)); - } - } - - int32x4_t final_accs[2]; - for (unsigned int i = 0; i < 2; i++) - { - const int32x4_t y = rounding_divide_by_exp2( - saturating_doubling_high_mul(accs[i], multipliers[i]), - shifts[i]); - const int32x4_t offset = reinterpret_cast(vdupq_n_u32(output_offset)); - final_accs[i] = vaddq_s32(y, offset); - final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min)); - final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max)); - } - - const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]), - vreinterpretq_s16_s32(final_accs[1])); - const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]); - const uint8x8_t output = - vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0])); - - vst1_u8(get_output_ptr(oi, oj, channel), output); - } - } - } - - for (; n_channels; n_channels--, channel++) - { - // Load bias - const int32_t bias = *reinterpret_cast(wbptr); - const int32_t multiplier = *reinterpret_cast(wbptr + sizeof(int32_t)); - const int32_t shift = *reinterpret_cast(wbptr + 2*sizeof(int32_t)); - - wbptr += 3*sizeof(int32_t); - - // Load weights - int16_t weights[KernelRows][KernelCols]; - for (unsigned int i = 0; i < KernelRows; i++) - { - for (unsigned int j = 0; j < KernelCols; j++) - { - weights[i][j] = *(wbptr++); - } - } - - // Load the input activations - int16_t inputs[InnerTileRows][InnerTileCols]; - for (unsigned int i = 0; i < InnerTileRows; i++) - { - for (unsigned int j = 0; j < InnerTileCols; j++) - { - inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset; - } - } - - // Perform the convolution - for (unsigned int oi = 0; oi < OutputTileRows; oi++) - { - for (unsigned int oj = 0; oj < OutputTileCols; oj++) - { - int32_t acc = bias; - - for (unsigned int wi = 0; wi < KernelRows; wi++) - { - for (unsigned int wj = 0; wj < KernelCols; wj++) - { - const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; - acc += w * x; - } - } - - // Requantize - acc = rounding_divide_by_exp2( - saturating_doubling_high_mul(acc, multiplier), - -shift); - acc += output_offset; - acc = std::max(acc, clamp_min); - acc = std::min(acc, clamp_max); - uint8_t output = static_cast(acc); - *(get_output_ptr(oi, oj, channel)) = output; - } - } - } -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols, - typename FInput, typename FOutput -> -static inline void execute_tilefn_hybrid( - int n_channels, - const void* packed_params, - const ActivationFunction actfn, - const qasymm8::QAsymm8Params &input_quant, - const qasymm8::QAsymm8Params &output_quant, - FInput &get_input_ptr, - FOutput &get_output_ptr) { - - // Compute min/max clamp values - int32_t clamp_min = std::numeric_limits::min(); - int32_t clamp_max = std::numeric_limits::max(); - - if (actfn == ActivationFunction::ReLU) { - clamp_min = output_quant.offset; - } - - // Disabling Relu6 for now - if (actfn == ActivationFunction::ReLU6) { - const int32_t top_rail = output_quant.quantize(6.0f); - clamp_max = std::min(clamp_max, top_rail); - } - - // Call the tile execution method - tilefn_hybrid(n_channels, packed_params, get_input_ptr, 
get_output_ptr, clamp_min, clamp_max, input_quant.offset, output_quant.offset); -} -} - - - -namespace depthwise { -using namespace qsymm8; -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - const ActivationFunction activation, - const QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : QSymm8HybridPerChannelDepthwiseConvolution( - n_batches, n_input_rows, n_input_cols, n_channels, - activation, weight_quantisation, input_quantisation, output_quantisation, - QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation), - padding_top, padding_left, padding_bottom, padding_right - ) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::QSymm8HybridPerChannelDepthwiseConvolution( - int n_batches, int n_input_rows, int n_input_cols, int n_channels, - const ActivationFunction activation, - const QSymm8PerChannelParams& weight_quantisation, - const qasymm8::QAsymm8Params& input_quantisation, - const qasymm8::QAsymm8Params& output_quantisation, - const QSymm8PerChannelRescaleParams& rescale_params, - unsigned int padding_top, - unsigned int padding_left, - unsigned int padding_bottom, - unsigned int padding_right -) : Base( - n_batches, n_input_rows, n_input_cols, n_channels, activation, - padding_top, padding_left, padding_bottom, padding_right - ), - _weights_quant(weight_quantisation), - _input_quant(input_quantisation), - _output_quant(output_quantisation), - _rescale_parameters(rescale_params) -{ -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -uint8_t QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::_input_padding_value(void) const -{ - return _input_quant.offset; -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -void QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::_pack_params( - void * const buffer, - const void * const weights, - const unsigned int weight_row_stride, - const unsigned int weight_col_stride, - const void * const biases -) const -{ - const int8_t *wptr = static_cast(weights); - const int32_t *bptr = static_cast(biases); - const int32_t *mptr = static_cast(_rescale_parameters.multipliers.data()); - const int32_t *sptr = static_cast(_rescale_parameters.shifts.data()); - int8_t *outptr = static_cast(buffer); - - // We set the vector length to use doubles on both 
Aarch64 and Aarch32. NOTE - // For SVE set this to half the vector length. - unsigned int veclen = 8; - - // While there are channels left to process, pack a vector length of them at - // a time and reduce the size of vector used as the size of the tensor - // decreases. - for ( - unsigned int n_channels = this->n_channels(); n_channels; - n_channels -= veclen, - outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols) - ) - { - // NOTE Ignore this section if using SVE, the vector length remains the - // same and we just don't fill a full register for the tail. - while (n_channels < veclen) - { - // Reduce the vector length to either 8 or 1 (scalar) - // TODO Support more vector lengths in `execute_tile`. - veclen = (veclen == 16) ? 8 : 1; - } - - // Get pointers to bias and weight portions of the output structure. - int32_t *out_bptr = reinterpret_cast(outptr); - int32_t *out_mptr = reinterpret_cast(outptr + veclen*sizeof(int32_t)); - int32_t *out_sptr = reinterpret_cast(outptr + 2*veclen*sizeof(int32_t)); - int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t); - - // Copy a vector length of elements - for (unsigned int n = 0; n < veclen && n < n_channels; n++) - { - const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0; - const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0; - const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0; - - out_bptr[n] = bias; - out_mptr[n] = multiplier; - out_sptr[n] = -shift; - - for (unsigned int i = 0; i < KernelRows; i++) - { - int8_t *row_outptr = out_wptr + i*KernelCols*veclen; - for (unsigned int j = 0; j < KernelCols; j++) - { - int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride); - row_outptr[j*veclen + n] = w; - } - } - wptr++; - } - } -} - - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptr, - unsigned int in_row_stride, - unsigned int in_col_stride, - uint8_t* outptr, - unsigned int out_row_stride, - unsigned int out_col_stride -) { - - // Construct methods to get pointers - const auto get_input_ptr = [inptr, in_row_stride, in_col_stride]( - const int i, const int j, const int channel) { - return inptr + i * in_row_stride + j * in_col_stride + channel; - }; - - const auto get_output_ptr = [outptr, out_row_stride, out_col_stride]( - const int i, const int j, const int channel) { - return outptr + i * out_row_stride + j * out_col_stride + channel; - }; - - execute_tilefn_hybrid( - n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr); -} - -template < - unsigned int OutputTileRows, unsigned int OutputTileCols, - unsigned int KernelRows, unsigned int KernelCols, - unsigned int StrideRows, unsigned int StrideCols -> -template -void QSymm8HybridPerChannelDepthwiseConvolution< - OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols ->::execute_tile( - int n_channels, - const void* packed_params, - const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], - uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] -) { - // Construct methods to get pointers - const auto get_input_ptr = [inptrs](const int i, const int j, - const int channel) { - return inptrs[i][j] + channel; - 
};
-
-  const auto get_output_ptr = [outptrs](const int i, const int j,
-                                        const int channel) {
-    return outptrs[i][j] + channel;
-  };
-
-  // Call the tile execution method
-  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
-                        StrideRows, StrideCols>(
-      n_channels, packed_params, Activation, _input_quant, _output_quant, get_input_ptr, get_output_ptr);
-}
-
-}  // namespace depthwise
diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index 4ddb35f2d5..eac9baaf01 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -28,7 +28,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
 #include "src/core/NEON/wrapper/traits.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -98,6 +97,38 @@ struct DepthwiseConvolutionRunInfo
     }
 };
 
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
+{
+    return vqrdmulhq_n_s32(a, b);
+}
+
+inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
+{
+    return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
+{
+    const int32x4_t shift = vdupq_n_s32(-exponent);
+    const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+    const int32x4_t fixed = vqaddq_s32(x, fixup);
+    return vrshlq_s32(fixed, shift);
+}
+
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
+{
+    const int32x2_t shift = vdup_n_s32(-exponent);
+    const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+    const int32x2_t fixed = vqadd_s32(x, fixup);
+    return vrshl_s32(fixed, shift);
+}
+
+inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
+{
+    const int32x2_t xs = vdup_n_s32(x);
+    return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
+
 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
 {
     const int32_t current_h = base_h + h * dilation.y();
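The helpers added above take a (multiplier, shift) pair rather than a float scale. For reference, a sketch of the usual frexp-based decomposition of a real rescale factor into that pair; quantize_scale is an illustrative name, and in the library this computation lives in the AsymmHelpers utilities rather than in this file:

#include <cmath>
#include <cstdint>

// Decompose a positive real scale s (typically scale_src * scale_weights /
// scale_dst, below 1 in practice) into s ~= multiplier / 2^(31 + shift),
// with multiplier a Q0.31 value in [2^30, 2^31).
inline void quantize_scale(double s, int32_t &multiplier, int &shift)
{
    int exp = 0;
    const double mantissa = std::frexp(s, &exp); // s = mantissa * 2^exp, mantissa in [0.5, 1)
    shift = -exp;
    int64_t q = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
    if(q == (1ll << 31)) // mantissa rounded up to exactly 1.0
    {
        q /= 2;
        --shift;
    }
    multiplier = static_cast<int32_t>(q);
}

// rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift)
// then recovers round(acc * s) for non-negative shift.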
diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
new file mode 100644
index 0000000000..f5c63b763f
--- /dev/null
+++ b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/AssemblyUtils.h"
+
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+
+#include "depthwise_common.hpp"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+constexpr unsigned int idx_width    = 1;
+constexpr unsigned int idx_height   = 2;
+constexpr unsigned int idx_channels = 0;
+constexpr unsigned int idx_batches  = 3;
+
+template <typename TSrc, typename TWeights, typename TDst>
+void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
+                    const ConvolutionInfo &info, const CPUInfo &cpu_info,
+                    std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel)
+{
+    unsigned int stride_cols{};
+    unsigned int stride_rows{};
+    std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
+
+    const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
+
+    const unsigned int n_batches  = src->dimension(idx_batches);
+    const unsigned int src_rows   = src->dimension(idx_height);
+    const unsigned int src_cols   = src->dimension(idx_width);
+    const unsigned int n_channels = src->dimension(idx_channels);
+    const unsigned int dst_rows   = dst->dimension(idx_height);
+    const unsigned int dst_cols   = dst->dimension(idx_width);
+
+    const unsigned int kernel_cols = weights->dimension(idx_width);
+    const unsigned int kernel_rows = weights->dimension(idx_height);
+
+    const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
+
+    arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+                                            n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
+                                            padding, activation, nullptr);
+
+    // Configure the assembly depthwise kernel
+    auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
+    if(dwc_kernel_asm == nullptr)
+    {
+        // Configuration not supported: Leave function unconfigured:
+        return;
+    }
+
+    kernel = std::move(dwc_kernel_asm);
+}
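The dst_rows/dst_cols fed into DepthwiseArgs have to agree with the padding, stride, and kernel size; the usual relation, which ACL's shape calculator applies before this point, is sketched below (conv_out_dim is an illustrative helper, not part of this patch):

// Output extent of a convolution along one axis, floor mode:
//   out = (in + pad_before + pad_after - kernel) / stride + 1
inline unsigned int conv_out_dim(unsigned int in, unsigned int kernel,
                                 unsigned int stride,
                                 unsigned int pad_before, unsigned int pad_after)
{
    return (in + pad_before + pad_after - kernel) / stride + 1;
}

// e.g. a 3x3 depthwise at stride 2 with (1, 1) padding on a 224-wide input:
//   conv_out_dim(224, 3, 2, 1, 1) == 112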
unsigned int kernel_cols = weights->dimension(idx_width); + const unsigned int kernel_rows = weights->dimension(idx_height); + + const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); + + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, + padding, activation, nullptr); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto weights_qinfo = weights->quantization_info(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + const unsigned int num_filters = weights_qinfo.scale().size(); + + multipliers.resize(num_filters); + std::vector dst_shifts(num_filters); + quantization::compute_quantized_multipliers_and_shifts(src, + weights, + dst, + multipliers.data(), + dst_shifts.data()); + + // Quantize activation bounds + int32_t min_activation = std::numeric_limits::lowest(); + int32_t max_activation = std::numeric_limits::max(); + if(info.act_info.enabled()) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); + } + + // Set quantization parameters for assembly kernels + arm_gemm::Requantize32 requant_args{}; + if(is_data_type_quantized_per_channel(weights->data_type())) + { + left_shifts.resize(num_filters); + right_shifts.resize(num_filters); + bool need_left_shift = false; // Select more optimized path if left shift is not needed + for(unsigned int i = 0; i < num_filters; ++i) + { + left_shifts[i] = std::max(-dst_shifts[i], static_cast(0)); + right_shifts[i] = std::min(-dst_shifts[i], static_cast(0)); + if(dst_shifts[i] < 0 && !need_left_shift) + { + need_left_shift = true; + } + } + + requant_args = arm_gemm::Requantize32(nullptr, + 0, + src_qinfo.offset, + weights_qinfo.uniform().offset, + dst_qinfo.offset, + (need_left_shift) ? 
left_shifts.data() : nullptr, + right_shifts.data(), + multipliers.data(), + static_cast(min_activation), + static_cast(max_activation)); + } + else + { + requant_args = arm_gemm::Requantize32(nullptr, + 0, + src_qinfo.offset, + weights_qinfo.uniform().offset, + dst_qinfo.offset, + -dst_shifts[0], + multipliers[0], + static_cast(min_activation), + static_cast(max_activation)); + } + + // Configure assembly pooling kernel with requantization + auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args, requant_args); + if(dwc_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + kernel = std::move(dwc_kernel_asm); +} +} // namespace + +CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() + : _kernel_asm(nullptr), + _multipliers(), + _left_shifts(), + _right_shifts() +{ +} + +CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; + +void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst, + const ConvolutionInfo &info, const CPUInfo &cpu_info) +{ + ARM_COMPUTE_UNUSED(cpu_info); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + // Destination initialization if not yet initialized + const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + +#if defined(__aarch64__) + switch(src->data_type()) + { + case DataType::QASYMM8: + if(is_data_type_quantized_per_channel(weights->data_type())) + { + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); + } + else + { + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); + } + break; + case DataType::QASYMM8_SIGNED: + create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); + break; +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + case DataType::F16: + create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm); + break; +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + case DataType::F32: + create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm); + break; + default: + break; + } +#endif // defined(__aarch64__) + + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + +#if !defined(__aarch64__) + ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); +#endif // !defined(__aarch64__) + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)"); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 
weights->quantization_info().scale().size()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); + + if(is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + } + } + + if(dst->total_size() > 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + return Status{}; +} + +void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); + ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); + + const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); + + const auto src_shape = src->info()->tensor_shape(); + const auto dst_shape = dst->info()->tensor_shape(); + const auto src_padding = src->info()->padding(); + const auto dst_padding = dst->info()->padding(); + + const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; + const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); + const size_t ld_src_batch = ld_src_row * src_shape[2]; + const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; + const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); + const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; + + _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, + parameters_ptr, + dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + working_space, info.thread_id, info.num_threads); +} + +void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) +{ + _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); +} + +size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const +{ + return _kernel_asm->get_storage_size(); +} + +size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const +{ + return _kernel_asm->get_working_size(num_threads, num_input_channels); +} + +bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const +{ + return _kernel_asm != nullptr; +} + +const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const +{ + return "CpuDepthwiseConv2dAssemblyWrapperKernel"; +} +} // namespace kernels +} // namespace 
cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h new file mode 100644 index 0000000000..8ff44441e9 --- /dev/null +++ b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_conv +{ +namespace depthwise +{ +// Forward declarations +class IDepthwiseCommon; +} // depthwise +} // arm_conv + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** This class is a wrapper for the depthwise convolution assembly kernels. */ +class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel +{ +public: + /** Default constructor */ + CpuDepthwiseConv2dAssemblyWrapperKernel(); + ~CpuDepthwiseConv2dAssemblyWrapperKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyWrapperKernel); + + /** Initialise the kernel's src and dst. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] bias Bias tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data type supported: same as @p input. + * @param[in] info Depthwise convolution layer meta-data. + * @param[in] cpu_info CPU information needed to select the most appropriate kernel. + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info); + + /** Indicates whether or not this function can be used to process the given parameters. + * + * Similar to @ref CpuDepthwiseConv2dAssemblyWrapperKernel::configure() + * + * @return a status. 
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+    /** Pack bias and weights in a storage space for the assembly kernel
+     *
+     * @param[in] parameters_ptr Pointer to storage space.
+     * @param[in] bias_ptr       Pointer to bias buffer.
+     * @param[in] weights_ptr    Pointer to weights buffer.
+     * @param[in] ld_weights_col Columns displacement for the weights tensor.
+     * @param[in] ld_weights_row Rows displacement for the weights tensor.
+     */
+    void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
+
+    /** Get the amount of storage space required for the rearranged weights and bias.
+     *
+     * @return size of the storage space
+     */
+    size_t get_storage_size() const;
+
+    /** Get size of the workspace needed by the assembly kernel.
+     *
+     * @param[in] num_threads        Maximum number of threads that are going to be spawned.
+     * @param[in] num_input_channels Number of channels of the input tensor.
+     *
+     * @return size of workspace
+     */
+    size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const;
+
+    /** Was the asm kernel successfully configured?
+     *
+     * @return True if the asm kernel is configured and ready to run
+     */
+    bool is_configured() const;
+
+private:
+    std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> _kernel_asm;
+    std::vector<int32_t> _multipliers{};
+    std::vector<int32_t> _left_shifts{};
+    std::vector<int32_t> _right_shifts{};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index c78ffb9848..89dd27a20a 100644
--- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -43,11 +43,13 @@ using namespace arm_compute::misc::shape_calculator;
 void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
 {
+    ARM_COMPUTE_UNUSED(cpu_info);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // dst initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
 
+#if defined(__aarch64__)
     const bool requantize = src->quantization_info() != dst->quantization_info();
 
     switch(src->data_type())
@@ -83,6 +85,7 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn
         default:
             break;
     }
+#endif // defined(__aarch64__)
 
     Window win = calculate_max_window(*dst, Steps());
     INEKernel::configure(win);
@@ -192,7 +195,7 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
     arm_conv::pooling::PoolingStride stride{};
     std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
 
-    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+    const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
 
     constexpr unsigned int idx_width    = 1;
     constexpr unsigned int idx_height   = 2;
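For reference, the `arm_conv::PaddingValues` switch in these pooling hunks relies on the same plain field-by-field conversion that the new `assembly_utils::map_to_arm_conv_padding()` helper (introduced in src/core/utils/AssemblyUtils.cpp below) performs for the depthwise wrapper. A minimal standalone sketch of that mapping, using simplified stand-in types (the `*Lite` names are illustrative, not library types):

```cpp
#include <cstdio>

// Illustrative stand-in for arm_compute::PadStrideInfo (padding accessors only)
struct PadStrideInfoLite
{
    unsigned int left, top, right, bottom;
    unsigned int pad_left() const { return left; }
    unsigned int pad_top() const { return top; }
    unsigned int pad_right() const { return right; }
    unsigned int pad_bottom() const { return bottom; }
};

// Illustrative stand-in for arm_conv::PaddingValues
struct PaddingValuesLite
{
    unsigned int left, top, right, bottom;
};

// Same field-for-field copy the shared library helper reduces to
PaddingValuesLite map_to_arm_conv_padding_lite(const PadStrideInfoLite &psi)
{
    return PaddingValuesLite{ psi.pad_left(), psi.pad_top(), psi.pad_right(), psi.pad_bottom() };
}

int main()
{
    const PadStrideInfoLite psi{ 1, 1, 2, 2 };
    const PaddingValuesLite pv = map_to_arm_conv_padding_lite(psi);
    std::printf("l=%u t=%u r=%u b=%u\n", pv.left, pv.top, pv.right, pv.bottom);
    return 0;
}
```

Centralising this conversion in one helper keeps the pooling and depthwise wrappers from duplicating it each time the assembly interface changes.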
@@ -231,7 +234,7 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf arm_conv::pooling::PoolingStride stride{}; std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; constexpr unsigned int idx_width = 1; constexpr unsigned int idx_height = 2; diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp new file mode 100644 index 0000000000..1e8a2a54c9 --- /dev/null +++ b/src/core/utils/AssemblyUtils.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/utils/AssemblyUtils.h" + +namespace arm_compute +{ +namespace assembly_utils +{ +arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) +{ + arm_gemm::Activation gemm_act; + + // Early exit in case lower bound is other than 0, as it's not yet supported + if(act.b() != 0.f) + { + return gemm_act; + } + + switch(act.activation()) + { + case ActivationLayerInfo::ActivationFunction::RELU: + gemm_act.type = arm_gemm::Activation::Type::ReLU; + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; + gemm_act.param1 = act.a(); + gemm_act.param2 = 0.f; + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; + gemm_act.param1 = act.a(); + gemm_act.param2 = act.b(); + break; + default: + gemm_act.type = arm_gemm::Activation::Type::None; + } + + return gemm_act; +} + +arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info) +{ + return arm_conv::PaddingValues{ pad_stride_info.pad_left(), + pad_stride_info.pad_top(), + pad_stride_info.pad_right(), + pad_stride_info.pad_bottom() }; +} +} // namespace assembly_utils +} // namespace arm_compute diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h new file mode 100644 index 0000000000..e682973827 --- /dev/null +++ b/src/core/utils/AssemblyUtils.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef UTILS_CORE_ASSEMBLY_UTILS_H
+#define UTILS_CORE_ASSEMBLY_UTILS_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/assembly/common.hpp"
+#include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
+
+namespace arm_compute
+{
+namespace assembly_utils
+{
+/** Performs a mapping between Compute Library ActivationLayerInfo and the assembly Activation structure.
+ *
+ * @param[in] act Compute Library activation info.
+ *
+ * @return Assembly activation info.
+ */
+arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act);
+
+/** Performs a mapping between Compute Library PadStrideInfo and the assembly PaddingValues structure.
+ *
+ * @param[in] pad_stride_info Compute Library padding and strides info.
+ *
+ * @return Assembly padding values.
+ */
+arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info);
+} // namespace assembly_utils
+} // namespace arm_compute
+#endif /* UTILS_CORE_ASSEMBLY_UTILS_H */
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 49e39f663f..81e813caff 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -185,16 +185,15 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
 void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
                                               const ITensorInfo *weights,
                                               const ITensorInfo *output,
-                                              unsigned int idx_ofms,
                                               int32_t *output_multipliers_ptr,
                                               int32_t *output_shifts_ptr)
 {
-    const unsigned int num_filters = is_data_type_quantized_per_channel(weights->data_type()) ? 
weights->dimension(idx_ofms) : 1; - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const QuantizationInfo wq_info = weights->quantization_info(); const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); + const unsigned int num_filters = wq_info.scale().size(); + for(unsigned int i = 0; i < num_filters; ++i) { int32_t output_multiplier = 0; diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 6467caffef..c7520cd087 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -307,11 +307,9 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() { _output_multipliers.map(); _output_shifts.map(); - const unsigned int idx_ofms = _needs_permute ? 2 : 0; quantization::compute_quantized_multipliers_and_shifts(_input->info(), _original_weights->info(), _output->info(), - idx_ofms, reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); @@ -513,11 +511,9 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepar { _output_multipliers.map(); _output_shifts.map(); - const unsigned int idx_ofms = _is_nhwc ? 0 : 2; quantization::compute_quantized_multipliers_and_shifts(_input->info(), _original_weights->info(), _output->info(), - idx_ofms, reinterpret_cast(_output_multipliers.ptr_to_element(Coordinates(0))), reinterpret_cast(_output_shifts.ptr_to_element(Coordinates(0)))); _output_multipliers.unmap(); diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 3184d5dfe0..188f3b8819 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -352,7 +352,6 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, quantization::compute_quantized_multipliers_and_shifts(input->info(), weights->info(), output->info(), - idx_kernels, gemmlowp_output_stage.gemmlowp_multipliers.data(), gemmlowp_output_stage.gemmlowp_shifts.data()); gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; @@ -562,7 +561,6 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI quantization::compute_quantized_multipliers_and_shifts(input, weights, output, - idx_kernels, gemmlowp_output_stage.gemmlowp_multipliers.data(), gemmlowp_output_stage.gemmlowp_shifts.data()); gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index a561b88058..daa5fd5ab9 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -137,9 +137,10 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: // Allocate memory based on the internal memory requirements experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); - _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size }, 1, DataType::S8), mem_req[0].alignment); - _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size }, 1, DataType::S8), mem_req[1].alignment); - + 
_impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment); + _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment); + _memory_group.manage(&_impl->workspace); + _memory_group.manage(&_impl->packed_weights); _impl->workspace.allocator()->allocate(); _impl->packed_weights.allocator()->allocate(); } diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp index 160a9fd70b..f577e94def 100644 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp @@ -62,8 +62,8 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); - //Validate Activation Layer - if(info.act_info.enabled()) + // Validate Activation Layer + if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -95,15 +95,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI _is_prepared = false; // Configure pipeline - ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - _is_activationlayer_enabled = info.act_info.enabled() && !(is_relu || is_relu6); - - if(!_is_activationlayer_enabled) - { - act_info_to_use = info.act_info; - } + _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); _dwc_optimized_func = std::make_unique(); if(_is_nchw) @@ -359,7 +351,7 @@ Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo } // Validate Activation Layer - if(info.act_info.enabled()) + if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h index 049397fe60..ae9f894aab 100644 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h @@ -92,9 +92,8 @@ private: * * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present - * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of dst is required - * -# @ref NEActivationLayer if fused activation is required + * -# @ref CpuDepthwiseConv2dAssemblyDispatch if assembly kernel implementation is present + * -# @ref CpuActivation if fused activation is required * */ class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index a36ee1d45b..660ac0163c 100644 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ 
b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -24,315 +24,22 @@ #include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" +#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" #include "src/core/helpers/AutoConfiguration.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include +#include "src/core/utils/AssemblyUtils.h" namespace arm_compute { namespace cpu { -namespace -{ -std::unique_ptr get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 
5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return std::make_unique>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr create_convolver(const ITensorInfo *src, - const ITensorInfo *weights, - ITensorInfo *output, - const ConvolutionInfo &info) -{ - const DataType data_type = src->data_type(); - const TensorShape shape = src->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = info.dilation.x(); - const int padding_top = info.pad_stride_info.pad_top(); - const int padding_left = info.pad_stride_info.pad_left(); - const int padding_bottom = info.pad_stride_info.pad_bottom(); - const int 
padding_right = info.pad_stride_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = info.pad_stride_info.stride().first; - const unsigned int kernel_size = weights->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(info.act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = src->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector fmultipliers; - std::vector qmultipliers; - std::vector qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } 
-
-            qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers);
-
-            return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation,
-                                                   wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
-        }
-        else
-        {
-            // Create float convolver
-            switch(data_type)
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                {
-                    return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F32:
-                {
-                    return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
-                }
-                default:
-                    return nullptr;
-            }
-        }
-}
-} // namespace
-
 struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
 {
-    std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr };
-    NEDepthwiseConvolutionAssemblyKernelWrapper       dwc_acl_kernel{};
-    bool                                              is_prepared{ false };
-    experimental::MemoryRequirements                  mem_req{};
+    std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
+    bool                                                              is_prepared{ false };
+    experimental::MemoryRequirements                                  mem_req{};
 };
 
 #ifndef DOXYGEN_SKIP_THIS
@@ -350,206 +57,71 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
                                                    ITensorInfo           *dst,
                                                    const ConvolutionInfo &info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_UNUSED(bias);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dAssemblyDispatch::validate(src,
-                                                                            weights,
-                                                                            bias != nullptr ? bias : nullptr,
-                                                                            dst,
-                                                                            info));
-
-    // Output auto inizialitation if not yet initialized
-    const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
-    auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dst_shape).set_quantization_info(dst->quantization_info()));
-
-    _pImpl->is_prepared = false;
+    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
+    const unsigned int num_threads = NEScheduler::get().num_threads();
+    _pImpl->is_prepared            = false;
 
-    // Create convolver
-    _pImpl->dwc_assembly_kernel = create_convolver(src, weights, dst, info);
-    ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr);
-
-    // Create assembly kernel wrapper
-    _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get());
-
-    constexpr size_t alignment = 128;
-
-    // Create workspace
-    const unsigned int num_threads    = NEScheduler::get().num_threads();
-    const size_t       workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads);
-    ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
-    _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment });
+    // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
+    if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+    {
+        return;
+    }
 
-    // Create packing tensor
-    const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size();
-    ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
+    auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>();
+    ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr);
+    dwc_wrapper->configure(src, weights, bias, dst, info, ci);
 
-    _pImpl->mem_req.push_back({ 
TensorType::ACL_INT_1, pack_tensor_size, alignment }); + // Compute memory requirements for assembly kernels + constexpr size_t alignment = 4096; + _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads, src->dimension(0)), alignment }); + _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment }); + _pImpl->asm_kernel = std::move(dwc_wrapper); } -experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) { - return _pImpl->mem_req; + return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info); } -Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *dst, - const ConvolutionInfo &info) +experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const { - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(src, weights, info)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(info.act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info); - ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(dst->total_size() != 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - // The uniform quantization case will only have 1 scale value in the weights quantization info - const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = src_qinfo.scale * s / dst_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; + return _pImpl->mem_req; } -bool CpuDepthwiseConv2dAssemblyDispatch::is_optimized_supported(const ITensorInfo *src, - const ITensorInfo *weights, - const ConvolutionInfo &info) +bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = src->data_layout(); - TensorShape in_shape{ src->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, 
src->tensor_shape().y()); - in_shape.set(Window::DimY, src->tensor_shape().z()); - in_shape.set(Window::DimZ, src->tensor_shape().x()); - } - - // Check data type - const DataType input_type = src->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weighs size - std::set supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = info.pad_stride_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = info.pad_stride_info.pad_top(); - const auto pad_right = info.pad_stride_info.pad_right(); - const auto pad_bottom = info.pad_stride_info.pad_bottom(); - const auto pad_left = info.pad_stride_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported; + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation); + return act.type != arm_gemm::Activation::Type::None; } void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) { - // Prepare assembly kernel - prepare(tensors); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto workspace = tensors.get_tensor(TensorType::ACL_INT_0); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr); - _pImpl->dwc_assembly_kernel->set_working_space(static_cast(workspace->buffer())); - - ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr); - const int input_element_size = src->info()->element_size(); - const int input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = src->info()->strides_in_bytes().z() / 
input_element_size; - const int input_col_stride = src->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr); - const int output_element_size = dst->info()->element_size(); - const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = dst->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = dst->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); + prepare(tensors); - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX); + NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors); } void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) { if(!_pImpl->is_prepared) { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1); + // Pack weights and bias + const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); - ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr); + const auto weights_ptr = weights->buffer() + weights->info()->offset_first_element_in_bytes(); + const auto bias_ptr = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr; + auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); - // Pack weights and bias - const int weights_element_size = weights->info()->element_size(); - const int weights_row_stride = weights->info()->strides_in_bytes().z() / weights_element_size; - const int weights_col_stride = weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(), - weights->buffer() + weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (bias != nullptr) ? bias->buffer() : nullptr); - _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer()); + const auto weights_shape = weights->info()->tensor_shape(); + const auto weights_padding = weights->info()->padding(); + + const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; + const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); + _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); weights->mark_as_unused(); if(bias != nullptr) diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h index 195942b7fd..70845163f4 100644 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h +++ b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H -#define ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H #include "src/core/common/Macros.h" #include "src/runtime/cpu/ICpuOperator.h" @@ -40,15 +40,15 @@ public: ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); /** Default destructor */ ~CpuDepthwiseConv2dAssemblyDispatch(); - /** Initialize the function's source, destination, kernels and border_size. * * @note Supports only NHWC format * - * @param[in] src Source tensor info. Data type supported: QASYMM8/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p src. + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. + * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src. + * Data type supported: same as @p src or S32 if @p src is quantized. * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. */ @@ -60,18 +60,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); - /** Check if the optimized kernel can be used for the given kernel sizes and strides - * - * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC + /** Checks if activation is supported by the assembly kernels * - * @param[in] src Input tensor info. - * @param[in] weights Weights tensor info. - * @param[in] info Depthwise convolution meta-data. + * @param[in] activation Activation to check * - * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed. 
+ * @return True if activation is supported else false */ - static bool is_optimized_supported(const ITensorInfo *src, const ITensorInfo *weights, const ConvolutionInfo &info); - + static bool is_activation_supported(const ActivationLayerInfo &activation); // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; @@ -83,4 +78,4 @@ private: }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DASSEMBLYDISPATCH_H */ +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */ diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index ea3742fee5..1101e05a0d 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -27,6 +27,7 @@ #include "src/core/CPP/Validate.h" #include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/core/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/core/utils/AssemblyUtils.h" #include #include @@ -89,38 +90,6 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen return p; } -arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) -{ - arm_gemm::Activation gemm_act; - - // Early exit in case lower bound is other than 0, as it's not yet supported - if(act.b() != 0.f) - { - return gemm_act; - } - - switch(act.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - gemm_act.type = arm_gemm::Activation::Type::ReLU; - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = 0.f; - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = act.b(); - break; - default: - gemm_act.type = arm_gemm::Activation::Type::None; - } - - return gemm_act; -} - IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type) { // Schedule assembly kernel @@ -788,14 +757,14 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) { - arm_gemm::Activation act = map_to_arm_gemm_activation(activation); + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation); return act.type != arm_gemm::Activation::Type::None; } void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info); + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h index 4fd461dd9d..a19e7ee8cf 100644 --- a/tests/datasets/DepthwiseConvolutionLayerDataset.h +++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h @@ -220,8 +220,6 @@ public: add_config(TensorShape(9U, 9U, 32U), 
Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)); add_config(TensorShape(9U, 9U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), Size2D(2U, 2U)); add_config(TensorShape(9U, 9U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, DimensionRoundingType::CEIL)); - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - // add_config(TensorShape(9U, 9U, 1U), Size2D(3U, 3U), PadStrideInfo(2, 2, 2, 2, DimensionRoundingType::CEIL), Size2D(2U, 2U)); } }; /** Dataset containing optimized, 3x3 depthwise convolution shapes. */ @@ -259,11 +257,9 @@ public: add_config(TensorShape(7U, 7U, 16U), Size2D(5U, 5U), PadStrideInfo(1, 1, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U)); // Stride 2 add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)); - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - // add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), Size2D(2U, 2U)); + add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), Size2D(2U, 2U)); add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 2, 2, 2, 2, DimensionRoundingType::CEIL)); - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - // add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 4, 4, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U)); + add_config(TensorShape(9U, 9U, 32U), Size2D(5U, 5U), PadStrideInfo(2, 2, 4, 4, 4, 4, DimensionRoundingType::CEIL), Size2D(2U, 2U)); } }; } // namespace datasets diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h index 2c943735ca..19ec6b2560 100644 --- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h +++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h @@ -117,8 +117,7 @@ public: void allocate_and_run_target() { - // TODO: uncomment after COMPMID-4361 - // add_padding_x({ &_src, &_weights, &_biases, &_target }, _data_layout); + add_padding_x({ &_src, &_weights, &_biases, &_target }, _data_layout); // Allocate tensors _src.allocator()->allocate(); -- cgit v1.2.1
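Closing note on the requantization math used by the integer kernels in this patch: the `saturating_doubling_high_mul()` and `rounding_divide_by_exp2()` helpers added to CpuDepthwiseConv2dNativeKernel.cpp implement the usual gemmlowp-style fixed-point rescale, `out = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, M), s)`, where `M` is a Q0.31 multiplier. A scalar reference sketch under that assumption (standalone, no NEON; the `*_scalar` names are illustrative):

```cpp
#include <cstdint>
#include <cstdio>

// Scalar model of vqrdmulh: high 32 bits of 2*a*b with rounding, saturated
// in the single overflow case a == b == INT32_MIN.
int32_t saturating_doubling_high_mul_scalar(int32_t a, int32_t b)
{
    if(a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX;
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (INT64_C(1) << 30) : (INT64_C(1) - (INT64_C(1) << 30));
    return static_cast<int32_t>((ab + nudge) / (INT64_C(1) << 31));
}

// Scalar model of the fixup-plus-vrshl sequence in the NEON helper: divide by
// 2^exponent, rounding to nearest, with the fixup handling negative inputs.
int32_t rounding_divide_by_exp2_scalar(int32_t x, int exponent)
{
    const int32_t mask      = (INT32_C(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

int main()
{
    // Requantize an int32 accumulator by the real-valued scale 0.5 / 2^8:
    // encode 0.5 as the Q0.31 multiplier 1 << 30, then shift right by 8.
    const int32_t acc = 123456;
    const int32_t out = rounding_divide_by_exp2_scalar(saturating_doubling_high_mul_scalar(acc, INT32_C(1) << 30), 8);
    std::printf("%d -> %d\n", acc, out); // prints "123456 -> 241"
    return 0;
}
```

On NEON, the same two steps map to `vqrdmulhq_n_s32` followed by the fixup-and-`vrshlq_s32` sequence shown in the CpuDepthwiseConv2dNativeKernel.cpp hunk above; splitting the real-valued scale into a multiplier and a power-of-two shift is what lets the whole pipeline stay in integer arithmetic.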