From a4bba9c594c4022c9f85192bb8fd3593ad1a8d3c Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 2 Apr 2019 15:27:52 +0100 Subject: COMPMID-1995: Fix 32-bit NEDepthwiseConvolution errors. -Updates padding handling in assembly depthwise kernels. -Fixes 32-bit runs issues for depthwise convolution. Change-Id: I3fe6369397c1d13f5629dd34c068ce4af53c95cd Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/939 Reviewed-by: Giuseppe Rossini Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp | 1515 +++++++++++++ .../depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp | 2366 ++++++++++++++++++++ .../convolution/depthwise/impl_fp16_fp16.hpp | 172 +- .../convolution/depthwise/impl_fp32_fp32.hpp | 169 +- .../kernels/convolution/depthwise/impl_qa8_qa8.hpp | 325 +++ 5 files changed, 4542 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp index 4ac6276123..010dd81bce 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp @@ -429,6 +429,491 @@ void Conv::execute_tile( ); } +template <> +template <> +void Conv::execute_tile( + int n_channels, + const void *weight_bias_ptr, + const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + float *outptrs[Base::output_tile_rows][Base::output_tile_cols] +) +{ + __asm __volatile( + "mov x23, xzr\n" + "mov x24, xzr\n" + "and x25, %[n_channels], #3\n" + "lsr x26, %[n_channels], #2\n" + "cbz x26, 4f\n" + "1:\n" + "ldr q13, [%[wbptr]]\n" + "ldr x19, [%[inptrs], 0]\n" + "mov v10.16b, v13.16b\n" + "ldr q12, [%[wbptr], #16]\n" + "mov v8.16b, v13.16b\n" + "ldr q6, [%[wbptr], #32]\n" + "mov v9.16b, v13.16b\n" + "ldr q5, [%[wbptr], #48]\n" + "mov v7.16b, v13.16b\n" + "ldr q11, [%[wbptr], #64]\n" + "ldr q4, [%[wbptr], #80]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr q3, [%[wbptr], #96]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr q2, [%[wbptr], #112]\n" + "ldr x27, [%[inptrs], 120]\n" + "ldr q1, [%[wbptr], #128]\n" + "subs x26, x26, #1\n" + "ldr q0, [%[wbptr], #144]\n" + "ldr q14, [x19, x23]\n" + "fmla v10.4s, v14.4s, v12.4s\n" + "ldr q18, [x20, x23]\n" + "ldr q14, [x21, x23]\n" + "ldr x19, [%[inptrs], 8]\n" + "ldr q16, [x27, x23]\n" + "ldr x20, [%[inptrs], 48]\n" + "ldr q19, [x19, x23]\n" + "ldr x21, [%[inptrs], 88]\n" + "fmla v10.4s, v18.4s, v11.4s\n" + "ldr q15, [x20, x23]\n" + "ldr q18, [x21, x23]\n" + "ldr x19, [%[inptrs], 16]\n" + "ldr q13, [x19, x23]\n" + "fmla v10.4s, v19.4s, v6.4s\n" + "fmla v10.4s, v14.4s, v2.4s\n" + "beq 3f\n" + "2:\n" + "fmla v8.4s, v14.4s, v12.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v10.4s, v15.4s, v4.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v9.4s, v13.4s, v12.4s\n" + "ldr q14, [x20, x23]\n" + "ldr q17, [x19, x23]\n" + "ldr x22, [%[inptrs], 160]\n" + "fmla v8.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 128]\n" + "fmla v10.4s, v13.4s, v5.4s\n" + "ldr q15, [x22, x23]\n" + "fmla v9.4s, v14.4s, v11.4s\n" + "ldr q19, [x27, x23]\n" + "ldr x21, [%[inptrs], 96]\n" + "ldr x20, [%[inptrs], 64]\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v8.4s, v18.4s, v6.4s\n" + "ldr x22, [%[inptrs], 168]\n" + "fmla v10.4s, v18.4s, v1.4s\n" + "ldr q13, [x21, x23]\n" + "fmla v9.4s, v17.4s, v6.4s\n" + "ldr q18, [x20, x23]\n" + "fmla v7.4s, v13.4s, v12.4s\n" + "ldr q17, [x19, x23]\n" + "fmla v8.4s, v15.4s, v2.4s\n" + "ldr q15, [x22, x23]\n" + "fmla v10.4s, v14.4s, v3.4s\n" + "ldr x27, [%[inptrs], 136]\n" + "fmla v9.4s, v13.4s, v2.4s\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr q16, [x27, x23]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v8.4s, v19.4s, v4.4s\n" + "ldr q19, [x21, x23]\n" + "fmla v10.4s, v13.4s, v0.4s\n" + "ldr q12, [x20, x23]\n" + "fmla v9.4s, v18.4s, v4.4s\n" + "ldr x22, [%[inptrs], 176]\n" + "fmla v7.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 144]\n" + "fmla v8.4s, v13.4s, v5.4s\n" + "ldr q11, [x22, x23]\n" + "ldr q13, [x27, x23]\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v9.4s, v17.4s, v5.4s\n" + "ldr x22, [%[inptrs], 184]\n" + "fmla v7.4s, v19.4s, v6.4s\n" + "ldr q14, [x21, x23]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr q17, [x22, x23]\n" + "ldr x27, [%[inptrs], 152]\n" + "ldr x22, [%[inptrs], 192]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v9.4s, v19.4s, v1.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str q10, [x21, x24]\n" + "fmla v7.4s, v11.4s, v2.4s\n" + "fmla v8.4s, v16.4s, v3.4s\n" + "ldr q16, [x27, x23]\n" + "ldr q15, [x22, x23]\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v9.4s, v12.4s, v3.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v7.4s, v13.4s, v4.4s\n" + "ldr q13, [%[wbptr]]\n" + "fmla v8.4s, v11.4s, v0.4s\n" + "ldr q12, [%[wbptr], #16]\n" + "mov v10.16b, v13.16b\n" + "ldr q6, [%[wbptr], #32]\n" + "fmla v9.4s, v14.4s, v0.4s\n" + "ldr q11, [%[wbptr], #64]\n" + "fmla v7.4s, v14.4s, v5.4s\n" + "ldr q4, [%[wbptr], #80]\n" + "str q8, [x28, x24]\n" + "add x23, x23, #16\n" + "mov v8.16b, v13.16b\n" + "ldr q2, [%[wbptr], #112]\n" + "str q9, [x21, x24]\n" + "ldr x28, [%[outptrs], 24]\n" + "fmla v7.4s, v17.4s, v1.4s\n" + "ldr q5, [%[wbptr], #48]\n" + "mov v9.16b, v13.16b\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "ldr x19, [%[inptrs], 0]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr x27, [%[inptrs], 120]\n" + "subs x26, x26, #1\n" + "fmla v7.4s, v16.4s, v3.4s\n" + "ldr q1, [%[wbptr], #128]\n" + "ldr q14, [x19, x23]\n" + "fmla v10.4s, v14.4s, v12.4s\n" + "ldr q18, [x20, x23]\n" + "ldr q14, [x21, x23]\n" + "ldr x19, [%[inptrs], 8]\n" + "fmla v7.4s, v15.4s, v0.4s\n" + "ldr q3, [%[wbptr], #96]\n" + "ldr q19, [x19, x23]\n" + "ldr x20, [%[inptrs], 48]\n" + "fmla v10.4s, v18.4s, v11.4s\n" + "ldr q16, [x27, x23]\n" + "ldr q15, [x20, x23]\n" + "ldr x19, [%[inptrs], 16]\n" + "str q7, [x28, x24]\n" + "ldr x21, [%[inptrs], 88]\n" + "mov v7.16b, v13.16b\n" + "ldr q0, [%[wbptr], #144]\n" + "fmla v10.4s, v19.4s, v6.4s\n" + "ldr q13, [x19, x23]\n" + "ldr q18, [x21, x23]\n" + "add x24, x24, #16\n" + "fmla v10.4s, v14.4s, v2.4s\n" + "bne 2b\n" + "3:\n" + "fmla v8.4s, v14.4s, v12.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v10.4s, v15.4s, v4.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v9.4s, v13.4s, v12.4s\n" + "ldr q14, [x20, x23]\n" + "ldr q17, [x19, x23]\n" + "ldr x22, [%[inptrs], 160]\n" + "fmla v8.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 128]\n" + "fmla v10.4s, v13.4s, v5.4s\n" + "ldr q15, [x22, x23]\n" + "fmla v9.4s, v14.4s, v11.4s\n" + "ldr q19, [x27, x23]\n" + "ldr x21, [%[inptrs], 96]\n" + "ldr x20, [%[inptrs], 64]\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v8.4s, v18.4s, v6.4s\n" + "ldr x22, [%[inptrs], 168]\n" + "fmla v10.4s, v18.4s, v1.4s\n" + "ldr q13, [x21, x23]\n" + "fmla v9.4s, v17.4s, v6.4s\n" + "ldr q18, [x20, x23]\n" + "fmla v7.4s, v13.4s, v12.4s\n" + "ldr q17, [x19, x23]\n" + "fmla v8.4s, v15.4s, v2.4s\n" + "ldr q15, [x22, x23]\n" + "fmla v10.4s, v14.4s, v3.4s\n" + "ldr x27, [%[inptrs], 136]\n" + "fmla v9.4s, v13.4s, v2.4s\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr q16, [x27, x23]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v8.4s, v19.4s, v4.4s\n" + "ldr q19, [x21, x23]\n" + "fmla v10.4s, v13.4s, v0.4s\n" + "ldr q12, [x20, x23]\n" + "fmla v9.4s, v18.4s, v4.4s\n" + "ldr x22, [%[inptrs], 176]\n" + "fmla v7.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 144]\n" + "fmla v8.4s, v13.4s, v5.4s\n" + "ldr q11, [x22, x23]\n" + "ldr q13, [x27, x23]\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v9.4s, v17.4s, v5.4s\n" + "ldr x22, [%[inptrs], 184]\n" + "fmla v7.4s, v19.4s, v6.4s\n" + "ldr q14, [x21, x23]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr q17, [x22, x23]\n" + "ldr x27, [%[inptrs], 152]\n" + "ldr x22, [%[inptrs], 192]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v9.4s, v19.4s, v1.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str q10, [x21, x24]\n" + "fmla v7.4s, v11.4s, v2.4s\n" + "fmla v8.4s, v16.4s, v3.4s\n" + "ldr q16, [x27, x23]\n" + "ldr q15, [x22, x23]\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v9.4s, v12.4s, v3.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v7.4s, v13.4s, v4.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v8.4s, v11.4s, v0.4s\n" + "add x23, x23, #16\n" + "fmla v9.4s, v14.4s, v0.4s\n" + "fmla v7.4s, v14.4s, v5.4s\n" + "str q8, [x28, x24]\n" + "ldr x28, [%[outptrs], 24]\n" + "str q9, [x21, x24]\n" + "fmla v7.4s, v17.4s, v1.4s\n" + "fmla v7.4s, v16.4s, v3.4s\n" + "fmla v7.4s, v15.4s, v0.4s\n" + "str q7, [x28, x24]\n" + "add x24, x24, #16\n" + "4:\n" + "cbz x25, 7f\n" + "ldr s13, [%[wbptr]]\n" + "mov v10.16b, v13.16b\n" + "ldr s12, [%[wbptr], #4]\n" + "mov v8.16b, v13.16b\n" + "ldr s6, [%[wbptr], #8]\n" + "mov v9.16b, v13.16b\n" + "ldr s5, [%[wbptr], #12]\n" + "mov v7.16b, v13.16b\n" + "ldr s11, [%[wbptr], #16]\n" + "ldr s4, [%[wbptr], #20]\n" + "ldr x19, [%[inptrs], 0]\n" + "ldr s3, [%[wbptr], #24]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr s2, [%[wbptr], #28]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr s1, [%[wbptr], #32]\n" + "ldr x27, [%[inptrs], 120]\n" + "ldr s0, [%[wbptr], #36]\n" + "subs x25, x25, #1\n" + "ldr s14, [x19, x23]\n" + "ldr s18, [x20, x23]\n" + "fmla v10.4s, v14.4s, v12.4s\n" + "ldr s14, [x21, x23]\n" + "ldr s16, [x27, x23]\n" + "ldr x19, [%[inptrs], 8]\n" + "ldr x20, [%[inptrs], 48]\n" + "ldr x21, [%[inptrs], 88]\n" + "ldr s19, [x19, x23]\n" + "fmla v10.4s, v18.4s, v11.4s\n" + "ldr s15, [x20, x23]\n" + "ldr s18, [x21, x23]\n" + "ldr x19, [%[inptrs], 16]\n" + "ldr s13, [x19, x23]\n" + "fmla v10.4s, v19.4s, v6.4s\n" + "fmla v10.4s, v14.4s, v2.4s\n" + "beq 6f\n" + "5:\n" + "fmla v8.4s, v14.4s, v12.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v10.4s, v15.4s, v4.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v9.4s, v13.4s, v12.4s\n" + "ldr s14, [x20, x23]\n" + "ldr s17, [x19, x23]\n" + "ldr x22, [%[inptrs], 160]\n" + "fmla v8.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 128]\n" + "fmla v10.4s, v13.4s, v5.4s\n" + "ldr s15, [x22, x23]\n" + "fmla v9.4s, v14.4s, v11.4s\n" + "ldr s19, [x27, x23]\n" + "ldr x21, [%[inptrs], 96]\n" + "ldr x20, [%[inptrs], 64]\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v8.4s, v18.4s, v6.4s\n" + "ldr x22, [%[inptrs], 168]\n" + "fmla v10.4s, v18.4s, v1.4s\n" + "ldr s13, [x21, x23]\n" + "fmla v9.4s, v17.4s, v6.4s\n" + "ldr s18, [x20, x23]\n" + "fmla v7.4s, v13.4s, v12.4s\n" + "ldr s17, [x19, x23]\n" + "fmla v8.4s, v15.4s, v2.4s\n" + "ldr s15, [x22, x23]\n" + "fmla v10.4s, v14.4s, v3.4s\n" + "ldr x27, [%[inptrs], 136]\n" + "fmla v9.4s, v13.4s, v2.4s\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr s16, [x27, x23]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v8.4s, v19.4s, v4.4s\n" + "ldr s19, [x21, x23]\n" + "fmla v10.4s, v13.4s, v0.4s\n" + "ldr s12, [x20, x23]\n" + "fmla v9.4s, v18.4s, v4.4s\n" + "ldr x22, [%[inptrs], 176]\n" + "fmla v7.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 144]\n" + "fmla v8.4s, v13.4s, v5.4s\n" + "ldr s11, [x22, x23]\n" + "ldr s13, [x27, x23]\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v9.4s, v17.4s, v5.4s\n" + "ldr x22, [%[inptrs], 184]\n" + "fmla v7.4s, v19.4s, v6.4s\n" + "ldr s14, [x21, x23]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr s17, [x22, x23]\n" + "ldr x27, [%[inptrs], 152]\n" + "ldr x22, [%[inptrs], 192]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v9.4s, v19.4s, v1.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str s10, [x21, x24]\n" + "fmla v7.4s, v11.4s, v2.4s\n" + "fmla v8.4s, v16.4s, v3.4s\n" + "ldr s16, [x27, x23]\n" + "ldr s15, [x22, x23]\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v9.4s, v12.4s, v3.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v7.4s, v13.4s, v4.4s\n" + "ldr s13, [%[wbptr]]\n" + "fmla v8.4s, v11.4s, v0.4s\n" + "ldr s12, [%[wbptr], #4]\n" + "mov v10.16b, v13.16b\n" + "ldr s6, [%[wbptr], #8]\n" + "fmla v9.4s, v14.4s, v0.4s\n" + "ldr s11, [%[wbptr], #16]\n" + "fmla v7.4s, v14.4s, v5.4s\n" + "ldr s4, [%[wbptr], #20]\n" + "str s8, [x28, x24]\n" + "add x23, x23, #4\n" + "mov v8.16b, v13.16b\n" + "ldr s2, [%[wbptr], #28]\n" + "str s9, [x21, x24]\n" + "ldr x28, [%[outptrs], 24]\n" + "fmla v7.4s, v17.4s, v1.4s\n" + "ldr s5, [%[wbptr], #12]\n" + "mov v9.16b, v13.16b\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "ldr x19, [%[inptrs], 0]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr x27, [%[inptrs], 120]\n" + "subs x25, x25, #1\n" + "fmla v7.4s, v16.4s, v3.4s\n" + "ldr s1, [%[wbptr], #32]\n" + "ldr s14, [x19, x23]\n" + "fmla v10.4s, v14.4s, v12.4s\n" + "ldr s18, [x20, x23]\n" + "ldr s14, [x21, x23]\n" + "ldr x19, [%[inptrs], 8]\n" + "fmla v7.4s, v15.4s, v0.4s\n" + "ldr s3, [%[wbptr], #24]\n" + "ldr s19, [x19, x23]\n" + "ldr x20, [%[inptrs], 48]\n" + "fmla v10.4s, v18.4s, v11.4s\n" + "ldr s16, [x27, x23]\n" + "ldr s15, [x20, x23]\n" + "ldr x19, [%[inptrs], 16]\n" + "str s7, [x28, x24]\n" + "ldr x21, [%[inptrs], 88]\n" + "mov v7.16b, v13.16b\n" + "ldr s0, [%[wbptr], #36]\n" + "fmla v10.4s, v19.4s, v6.4s\n" + "ldr s13, [x19, x23]\n" + "ldr s18, [x21, x23]\n" + "add x24, x24, #4\n" + "fmla v10.4s, v14.4s, v2.4s\n" + "bne 5b\n" + "6:\n" + "fmla v8.4s, v14.4s, v12.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v10.4s, v15.4s, v4.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v9.4s, v13.4s, v12.4s\n" + "ldr s14, [x20, x23]\n" + "ldr s17, [x19, x23]\n" + "ldr x22, [%[inptrs], 160]\n" + "fmla v8.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 128]\n" + "fmla v10.4s, v13.4s, v5.4s\n" + "ldr s15, [x22, x23]\n" + "fmla v9.4s, v14.4s, v11.4s\n" + "ldr s19, [x27, x23]\n" + "ldr x21, [%[inptrs], 96]\n" + "ldr x20, [%[inptrs], 64]\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v8.4s, v18.4s, v6.4s\n" + "ldr x22, [%[inptrs], 168]\n" + "fmla v10.4s, v18.4s, v1.4s\n" + "ldr s13, [x21, x23]\n" + "fmla v9.4s, v17.4s, v6.4s\n" + "ldr s18, [x20, x23]\n" + "fmla v7.4s, v13.4s, v12.4s\n" + "ldr s17, [x19, x23]\n" + "fmla v8.4s, v15.4s, v2.4s\n" + "ldr s15, [x22, x23]\n" + "fmla v10.4s, v14.4s, v3.4s\n" + "ldr x27, [%[inptrs], 136]\n" + "fmla v9.4s, v13.4s, v2.4s\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr s16, [x27, x23]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v8.4s, v19.4s, v4.4s\n" + "ldr s19, [x21, x23]\n" + "fmla v10.4s, v13.4s, v0.4s\n" + "ldr s12, [x20, x23]\n" + "fmla v9.4s, v18.4s, v4.4s\n" + "ldr x22, [%[inptrs], 176]\n" + "fmla v7.4s, v16.4s, v11.4s\n" + "ldr x27, [%[inptrs], 144]\n" + "fmla v8.4s, v13.4s, v5.4s\n" + "ldr s11, [x22, x23]\n" + "ldr s13, [x27, x23]\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v9.4s, v17.4s, v5.4s\n" + "ldr x22, [%[inptrs], 184]\n" + "fmla v7.4s, v19.4s, v6.4s\n" + "ldr s14, [x21, x23]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr s17, [x22, x23]\n" + "ldr x27, [%[inptrs], 152]\n" + "ldr x22, [%[inptrs], 192]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v9.4s, v19.4s, v1.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str s10, [x21, x24]\n" + "fmla v7.4s, v11.4s, v2.4s\n" + "fmla v8.4s, v16.4s, v3.4s\n" + "ldr s16, [x27, x23]\n" + "ldr s15, [x22, x23]\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v9.4s, v12.4s, v3.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v7.4s, v13.4s, v4.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v8.4s, v11.4s, v0.4s\n" + "add x23, x23, #4\n" + "fmla v9.4s, v14.4s, v0.4s\n" + "fmla v7.4s, v14.4s, v5.4s\n" + "str s8, [x28, x24]\n" + "ldr x28, [%[outptrs], 24]\n" + "str s9, [x21, x24]\n" + "fmla v7.4s, v17.4s, v1.4s\n" + "fmla v7.4s, v16.4s, v3.4s\n" + "fmla v7.4s, v15.4s, v0.4s\n" + "str s7, [x28, x24]\n" + "add x24, x24, #4\n" + "7:\n" + : [wbptr] "+r" (weight_bias_ptr) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs) + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + ); +} + template <> template <> void Conv::execute_tile( @@ -848,6 +1333,511 @@ void Conv::execute_tile( ); } +template <> +template <> +void Conv::execute_tile( + int n_channels, + const void *weight_bias_ptr, + const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + float *outptrs[Base::output_tile_rows][Base::output_tile_cols] +) +{ + __asm __volatile( + "mov x22, xzr\n" + "mov x26, xzr\n" + "and x23, %[n_channels], #3\n" + "lsr x24, %[n_channels], #2\n" + "cbz x24, 4f\n" + "1:\n" + "ldr q14, [%[wbptr]]\n" + "ldr x19, [%[inptrs], 0]\n" + "mov v3.16b, v14.16b\n" + "ldr q13, [%[wbptr], #16]\n" + "mov v1.16b, v14.16b\n" + "ldr q11, [%[wbptr], #32]\n" + "mov v2.16b, v14.16b\n" + "ldr q4, [%[wbptr], #48]\n" + "mov v0.16b, v14.16b\n" + "ldr q12, [%[wbptr], #64]\n" + "ldr q9, [%[wbptr], #80]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr q8, [%[wbptr], #96]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr q7, [%[wbptr], #112]\n" + "ldr x25, [%[inptrs], 120]\n" + "ldr q6, [%[wbptr], #128]\n" + "subs x24, x24, #1\n" + "ldr q5, [%[wbptr], #144]\n" + "ldr q15, [x19, x22]\n" + "fmla v3.4s, v15.4s, v13.4s\n" + "ldr q17, [x20, x22]\n" + "ldr q16, [x21, x22]\n" + "ldr x19, [%[inptrs], 8]\n" + "ldr q15, [x25, x22]\n" + "ldr x20, [%[inptrs], 48]\n" + "ldr q10, [x19, x22]\n" + "ldr x21, [%[inptrs], 88]\n" + "fmla v3.4s, v17.4s, v12.4s\n" + "ldr q17, [x20, x22]\n" + "ldr q14, [x21, x22]\n" + "ldr x19, [%[inptrs], 16]\n" + "ldr q18, [x19, x22]\n" + "fmla v3.4s, v10.4s, v11.4s\n" + "fmla v3.4s, v16.4s, v7.4s\n" + "beq 3f\n" + "2:\n" + "fmla v1.4s, v16.4s, v13.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v3.4s, v17.4s, v9.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v2.4s, v18.4s, v13.4s\n" + "ldr q16, [x20, x22]\n" + "movi v10.16b, #0\n" + "ldr q17, [x19, x22]\n" + "fmla v1.4s, v15.4s, v12.4s\n" + "ldr x27, [%[inptrs], 160]\n" + "fmla v3.4s, v18.4s, v4.4s\n" + "ldr x25, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v12.4s\n" + "ldr q18, [x27, x22]\n" + "ldr q15, [x25, x22]\n" + "ldr x21, [%[inptrs], 96]\n" + "fmla v1.4s, v14.4s, v11.4s\n" + "ldr x20, [%[inptrs], 64]\n" + "fmla v3.4s, v14.4s, v6.4s\n" + "ldr q14, [x21, x22]\n" + "fmla v2.4s, v17.4s, v11.4s\n" + "ldr q17, [x20, x22]\n" + "fmla v0.4s, v14.4s, v13.4s\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v1.4s, v18.4s, v7.4s\n" + "ldr x27, [%[inptrs], 168]\n" + "fmla v3.4s, v16.4s, v8.4s\n" + "ldr q18, [x19, x22]\n" + "fmla v2.4s, v14.4s, v7.4s\n" + "ldr q13, [x27, x22]\n" + "ldr x25, [%[inptrs], 136]\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v1.4s, v15.4s, v9.4s\n" + "ldr x27, [%[inptrs], 176]\n" + "fmla v3.4s, v14.4s, v5.4s\n" + "ldr q16, [x25, x22]\n" + "fmla v2.4s, v17.4s, v9.4s\n" + "ldr q17, [x21, x22]\n" + "fmla v0.4s, v16.4s, v12.4s\n" + "ldr q12, [x20, x22]\n" + "fmla v1.4s, v14.4s, v4.4s\n" + "ldr q15, [x27, x22]\n" + "fmax v3.4s, v3.4s, v10.4s\n" + "ldr x25, [%[inptrs], 144]\n" + "fmla v2.4s, v18.4s, v4.4s\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v0.4s, v17.4s, v11.4s\n" + "ldr q14, [x25, x22]\n" + "fmla v1.4s, v13.4s, v6.4s\n" + "ldr q11, [x21, x22]\n" + "ldr x27, [%[inptrs], 184]\n" + "ldr x25, [%[inptrs], 152]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str q3, [x21, x26]\n" + "fmla v0.4s, v15.4s, v7.4s\n" + "fmla v1.4s, v16.4s, v8.4s\n" + "ldr q18, [x27, x22]\n" + "ldr q17, [x25, x22]\n" + "ldr x27, [%[inptrs], 192]\n" + "fmla v2.4s, v12.4s, v8.4s\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v0.4s, v14.4s, v9.4s\n" + "ldr q16, [x27, x22]\n" + "fmla v1.4s, v15.4s, v5.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "ldr q14, [%[wbptr]]\n" + "add x22, x22, #16\n" + "fmla v2.4s, v11.4s, v5.4s\n" + "ldr q13, [%[wbptr], #16]\n" + "fmla v0.4s, v11.4s, v4.4s\n" + "ldr q11, [%[wbptr], #32]\n" + "fmax v1.4s, v1.4s, v10.4s\n" + "ldr q12, [%[wbptr], #64]\n" + "mov v3.16b, v14.16b\n" + "ldr q9, [%[wbptr], #80]\n" + "fmax v2.4s, v2.4s, v10.4s\n" + "ldr q7, [%[wbptr], #112]\n" + "str q1, [x28, x26]\n" + "fmla v0.4s, v18.4s, v6.4s\n" + "mov v1.16b, v14.16b\n" + "ldr q4, [%[wbptr], #48]\n" + "str q2, [x21, x26]\n" + "ldr x28, [%[outptrs], 24]\n" + "mov v2.16b, v14.16b\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v0.4s, v17.4s, v8.4s\n" + "ldr q6, [%[wbptr], #128]\n" + "ldr x19, [%[inptrs], 0]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr x25, [%[inptrs], 120]\n" + "subs x24, x24, #1\n" + "ldr q15, [x19, x22]\n" + "fmla v0.4s, v16.4s, v5.4s\n" + "ldr q8, [%[wbptr], #96]\n" + "fmla v3.4s, v15.4s, v13.4s\n" + "ldr q17, [x20, x22]\n" + "ldr q16, [x21, x22]\n" + "ldr x19, [%[inptrs], 8]\n" + "ldr q15, [x25, x22]\n" + "ldr x20, [%[inptrs], 48]\n" + "fmax v0.4s, v0.4s, v10.4s\n" + "ldr q5, [%[wbptr], #144]\n" + "fmla v3.4s, v17.4s, v12.4s\n" + "ldr q10, [x19, x22]\n" + "ldr q17, [x20, x22]\n" + "ldr x19, [%[inptrs], 16]\n" + "str q0, [x28, x26]\n" + "ldr x21, [%[inptrs], 88]\n" + "mov v0.16b, v14.16b\n" + "ldr q18, [x19, x22]\n" + "fmla v3.4s, v10.4s, v11.4s\n" + "ldr q14, [x21, x22]\n" + "add x26, x26, #16\n" + "fmla v3.4s, v16.4s, v7.4s\n" + "bne 2b\n" + "3:\n" + "fmla v1.4s, v16.4s, v13.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v3.4s, v17.4s, v9.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v2.4s, v18.4s, v13.4s\n" + "ldr q16, [x20, x22]\n" + "movi v10.16b, #0\n" + "ldr q17, [x19, x22]\n" + "fmla v1.4s, v15.4s, v12.4s\n" + "ldr x27, [%[inptrs], 160]\n" + "fmla v3.4s, v18.4s, v4.4s\n" + "ldr x25, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v12.4s\n" + "ldr q18, [x27, x22]\n" + "ldr q15, [x25, x22]\n" + "ldr x21, [%[inptrs], 96]\n" + "fmla v1.4s, v14.4s, v11.4s\n" + "ldr x20, [%[inptrs], 64]\n" + "fmla v3.4s, v14.4s, v6.4s\n" + "ldr q14, [x21, x22]\n" + "fmla v2.4s, v17.4s, v11.4s\n" + "ldr q17, [x20, x22]\n" + "fmla v0.4s, v14.4s, v13.4s\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v1.4s, v18.4s, v7.4s\n" + "ldr x27, [%[inptrs], 168]\n" + "fmla v3.4s, v16.4s, v8.4s\n" + "ldr q18, [x19, x22]\n" + "fmla v2.4s, v14.4s, v7.4s\n" + "ldr q13, [x27, x22]\n" + "ldr x25, [%[inptrs], 136]\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v1.4s, v15.4s, v9.4s\n" + "ldr x27, [%[inptrs], 176]\n" + "fmla v3.4s, v14.4s, v5.4s\n" + "ldr q16, [x25, x22]\n" + "fmla v2.4s, v17.4s, v9.4s\n" + "ldr q17, [x21, x22]\n" + "fmla v0.4s, v16.4s, v12.4s\n" + "ldr q12, [x20, x22]\n" + "fmla v1.4s, v14.4s, v4.4s\n" + "ldr q15, [x27, x22]\n" + "fmax v3.4s, v3.4s, v10.4s\n" + "ldr x25, [%[inptrs], 144]\n" + "fmla v2.4s, v18.4s, v4.4s\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v0.4s, v17.4s, v11.4s\n" + "ldr q14, [x25, x22]\n" + "fmla v1.4s, v13.4s, v6.4s\n" + "ldr q11, [x21, x22]\n" + "ldr x27, [%[inptrs], 184]\n" + "ldr x25, [%[inptrs], 152]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str q3, [x21, x26]\n" + "fmla v0.4s, v15.4s, v7.4s\n" + "fmla v1.4s, v16.4s, v8.4s\n" + "ldr q18, [x27, x22]\n" + "ldr q17, [x25, x22]\n" + "ldr x27, [%[inptrs], 192]\n" + "fmla v2.4s, v12.4s, v8.4s\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v0.4s, v14.4s, v9.4s\n" + "ldr q16, [x27, x22]\n" + "fmla v1.4s, v15.4s, v5.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "add x22, x22, #16\n" + "fmla v2.4s, v11.4s, v5.4s\n" + "fmla v0.4s, v11.4s, v4.4s\n" + "fmax v1.4s, v1.4s, v10.4s\n" + "fmax v2.4s, v2.4s, v10.4s\n" + "str q1, [x28, x26]\n" + "fmla v0.4s, v18.4s, v6.4s\n" + "ldr x28, [%[outptrs], 24]\n" + "str q2, [x21, x26]\n" + "fmla v0.4s, v17.4s, v8.4s\n" + "fmla v0.4s, v16.4s, v5.4s\n" + "fmax v0.4s, v0.4s, v10.4s\n" + "str q0, [x28, x26]\n" + "add x26, x26, #16\n" + "4:\n" + "cbz x23, 7f\n" + "ldr s14, [%[wbptr]]\n" + "mov v3.16b, v14.16b\n" + "ldr s13, [%[wbptr], #4]\n" + "mov v1.16b, v14.16b\n" + "ldr s11, [%[wbptr], #8]\n" + "mov v2.16b, v14.16b\n" + "ldr s4, [%[wbptr], #12]\n" + "mov v0.16b, v14.16b\n" + "ldr s12, [%[wbptr], #16]\n" + "ldr s9, [%[wbptr], #20]\n" + "ldr x19, [%[inptrs], 0]\n" + "ldr s8, [%[wbptr], #24]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr s7, [%[wbptr], #28]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr s6, [%[wbptr], #32]\n" + "ldr x25, [%[inptrs], 120]\n" + "ldr s5, [%[wbptr], #36]\n" + "subs x23, x23, #1\n" + "ldr s15, [x19, x22]\n" + "ldr s17, [x20, x22]\n" + "fmla v3.4s, v15.4s, v13.4s\n" + "ldr s16, [x21, x22]\n" + "ldr s15, [x25, x22]\n" + "ldr x19, [%[inptrs], 8]\n" + "ldr x20, [%[inptrs], 48]\n" + "ldr x21, [%[inptrs], 88]\n" + "ldr s10, [x19, x22]\n" + "fmla v3.4s, v17.4s, v12.4s\n" + "ldr s17, [x20, x22]\n" + "ldr s14, [x21, x22]\n" + "ldr x19, [%[inptrs], 16]\n" + "ldr s18, [x19, x22]\n" + "fmla v3.4s, v10.4s, v11.4s\n" + "fmla v3.4s, v16.4s, v7.4s\n" + "beq 6f\n" + "5:\n" + "fmla v1.4s, v16.4s, v13.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v3.4s, v17.4s, v9.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v2.4s, v18.4s, v13.4s\n" + "ldr s16, [x20, x22]\n" + "movi v10.16b, #0\n" + "ldr s17, [x19, x22]\n" + "fmla v1.4s, v15.4s, v12.4s\n" + "ldr x27, [%[inptrs], 160]\n" + "fmla v3.4s, v18.4s, v4.4s\n" + "ldr x25, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v12.4s\n" + "ldr s18, [x27, x22]\n" + "ldr s15, [x25, x22]\n" + "ldr x21, [%[inptrs], 96]\n" + "fmla v1.4s, v14.4s, v11.4s\n" + "ldr x20, [%[inptrs], 64]\n" + "fmla v3.4s, v14.4s, v6.4s\n" + "ldr s14, [x21, x22]\n" + "fmla v2.4s, v17.4s, v11.4s\n" + "ldr s17, [x20, x22]\n" + "fmla v0.4s, v14.4s, v13.4s\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v1.4s, v18.4s, v7.4s\n" + "ldr x27, [%[inptrs], 168]\n" + "fmla v3.4s, v16.4s, v8.4s\n" + "ldr s18, [x19, x22]\n" + "fmla v2.4s, v14.4s, v7.4s\n" + "ldr s13, [x27, x22]\n" + "ldr x25, [%[inptrs], 136]\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v1.4s, v15.4s, v9.4s\n" + "ldr x27, [%[inptrs], 176]\n" + "fmla v3.4s, v14.4s, v5.4s\n" + "ldr s16, [x25, x22]\n" + "fmla v2.4s, v17.4s, v9.4s\n" + "ldr s17, [x21, x22]\n" + "fmla v0.4s, v16.4s, v12.4s\n" + "ldr s12, [x20, x22]\n" + "fmla v1.4s, v14.4s, v4.4s\n" + "ldr s15, [x27, x22]\n" + "fmax v3.4s, v3.4s, v10.4s\n" + "ldr x25, [%[inptrs], 144]\n" + "fmla v2.4s, v18.4s, v4.4s\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v0.4s, v17.4s, v11.4s\n" + "ldr s14, [x25, x22]\n" + "fmla v1.4s, v13.4s, v6.4s\n" + "ldr s11, [x21, x22]\n" + "ldr x27, [%[inptrs], 184]\n" + "ldr x25, [%[inptrs], 152]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str s3, [x21, x26]\n" + "fmla v0.4s, v15.4s, v7.4s\n" + "fmla v1.4s, v16.4s, v8.4s\n" + "ldr s18, [x27, x22]\n" + "ldr s17, [x25, x22]\n" + "ldr x27, [%[inptrs], 192]\n" + "fmla v2.4s, v12.4s, v8.4s\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v0.4s, v14.4s, v9.4s\n" + "ldr s16, [x27, x22]\n" + "fmla v1.4s, v15.4s, v5.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "ldr s14, [%[wbptr]]\n" + "add x22, x22, #4\n" + "fmla v2.4s, v11.4s, v5.4s\n" + "ldr s13, [%[wbptr], #4]\n" + "fmla v0.4s, v11.4s, v4.4s\n" + "ldr s11, [%[wbptr], #8]\n" + "fmax v1.4s, v1.4s, v10.4s\n" + "ldr s12, [%[wbptr], #16]\n" + "mov v3.16b, v14.16b\n" + "ldr s9, [%[wbptr], #20]\n" + "fmax v2.4s, v2.4s, v10.4s\n" + "ldr s7, [%[wbptr], #28]\n" + "str s1, [x28, x26]\n" + "fmla v0.4s, v18.4s, v6.4s\n" + "mov v1.16b, v14.16b\n" + "ldr s4, [%[wbptr], #12]\n" + "str s2, [x21, x26]\n" + "ldr x28, [%[outptrs], 24]\n" + "mov v2.16b, v14.16b\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v0.4s, v17.4s, v8.4s\n" + "ldr s6, [%[wbptr], #32]\n" + "ldr x19, [%[inptrs], 0]\n" + "ldr x20, [%[inptrs], 40]\n" + "ldr x21, [%[inptrs], 80]\n" + "ldr x25, [%[inptrs], 120]\n" + "subs x23, x23, #1\n" + "ldr s15, [x19, x22]\n" + "fmla v0.4s, v16.4s, v5.4s\n" + "ldr s8, [%[wbptr], #24]\n" + "fmla v3.4s, v15.4s, v13.4s\n" + "ldr s17, [x20, x22]\n" + "ldr s16, [x21, x22]\n" + "ldr x19, [%[inptrs], 8]\n" + "ldr s15, [x25, x22]\n" + "ldr x20, [%[inptrs], 48]\n" + "fmax v0.4s, v0.4s, v10.4s\n" + "ldr s5, [%[wbptr], #36]\n" + "fmla v3.4s, v17.4s, v12.4s\n" + "ldr s10, [x19, x22]\n" + "ldr s17, [x20, x22]\n" + "ldr x19, [%[inptrs], 16]\n" + "str s0, [x28, x26]\n" + "ldr x21, [%[inptrs], 88]\n" + "mov v0.16b, v14.16b\n" + "ldr s18, [x19, x22]\n" + "fmla v3.4s, v10.4s, v11.4s\n" + "ldr s14, [x21, x22]\n" + "add x26, x26, #4\n" + "fmla v3.4s, v16.4s, v7.4s\n" + "bne 5b\n" + "6:\n" + "fmla v1.4s, v16.4s, v13.4s\n" + "ldr x20, [%[inptrs], 56]\n" + "fmla v3.4s, v17.4s, v9.4s\n" + "ldr x19, [%[inptrs], 24]\n" + "fmla v2.4s, v18.4s, v13.4s\n" + "ldr s16, [x20, x22]\n" + "movi v10.16b, #0\n" + "ldr s17, [x19, x22]\n" + "fmla v1.4s, v15.4s, v12.4s\n" + "ldr x27, [%[inptrs], 160]\n" + "fmla v3.4s, v18.4s, v4.4s\n" + "ldr x25, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v12.4s\n" + "ldr s18, [x27, x22]\n" + "ldr s15, [x25, x22]\n" + "ldr x21, [%[inptrs], 96]\n" + "fmla v1.4s, v14.4s, v11.4s\n" + "ldr x20, [%[inptrs], 64]\n" + "fmla v3.4s, v14.4s, v6.4s\n" + "ldr s14, [x21, x22]\n" + "fmla v2.4s, v17.4s, v11.4s\n" + "ldr s17, [x20, x22]\n" + "fmla v0.4s, v14.4s, v13.4s\n" + "ldr x19, [%[inptrs], 32]\n" + "fmla v1.4s, v18.4s, v7.4s\n" + "ldr x27, [%[inptrs], 168]\n" + "fmla v3.4s, v16.4s, v8.4s\n" + "ldr s18, [x19, x22]\n" + "fmla v2.4s, v14.4s, v7.4s\n" + "ldr s13, [x27, x22]\n" + "ldr x25, [%[inptrs], 136]\n" + "ldr x21, [%[inptrs], 104]\n" + "ldr x20, [%[inptrs], 72]\n" + "fmla v1.4s, v15.4s, v9.4s\n" + "ldr x27, [%[inptrs], 176]\n" + "fmla v3.4s, v14.4s, v5.4s\n" + "ldr s16, [x25, x22]\n" + "fmla v2.4s, v17.4s, v9.4s\n" + "ldr s17, [x21, x22]\n" + "fmla v0.4s, v16.4s, v12.4s\n" + "ldr s12, [x20, x22]\n" + "fmla v1.4s, v14.4s, v4.4s\n" + "ldr s15, [x27, x22]\n" + "fmax v3.4s, v3.4s, v10.4s\n" + "ldr x25, [%[inptrs], 144]\n" + "fmla v2.4s, v18.4s, v4.4s\n" + "ldr x21, [%[inptrs], 112]\n" + "fmla v0.4s, v17.4s, v11.4s\n" + "ldr s14, [x25, x22]\n" + "fmla v1.4s, v13.4s, v6.4s\n" + "ldr s11, [x21, x22]\n" + "ldr x27, [%[inptrs], 184]\n" + "ldr x25, [%[inptrs], 152]\n" + "ldr x21, [%[outptrs], 0]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr x28, [%[outptrs], 16]\n" + "str s3, [x21, x26]\n" + "fmla v0.4s, v15.4s, v7.4s\n" + "fmla v1.4s, v16.4s, v8.4s\n" + "ldr s18, [x27, x22]\n" + "ldr s17, [x25, x22]\n" + "ldr x27, [%[inptrs], 192]\n" + "fmla v2.4s, v12.4s, v8.4s\n" + "ldr x21, [%[outptrs], 8]\n" + "fmla v0.4s, v14.4s, v9.4s\n" + "ldr s16, [x27, x22]\n" + "fmla v1.4s, v15.4s, v5.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "add x22, x22, #4\n" + "fmla v2.4s, v11.4s, v5.4s\n" + "fmla v0.4s, v11.4s, v4.4s\n" + "fmax v1.4s, v1.4s, v10.4s\n" + "fmax v2.4s, v2.4s, v10.4s\n" + "str s1, [x28, x26]\n" + "fmla v0.4s, v18.4s, v6.4s\n" + "ldr x28, [%[outptrs], 24]\n" + "str s2, [x21, x26]\n" + "fmla v0.4s, v17.4s, v8.4s\n" + "fmla v0.4s, v16.4s, v5.4s\n" + "fmax v0.4s, v0.4s, v10.4s\n" + "str s0, [x28, x26]\n" + "add x26, x26, #4\n" + "7:\n" + : [wbptr] "+r" (weight_bias_ptr) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs) + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + ); +} + template <> template <> void Conv::execute_tile( @@ -1287,6 +2277,531 @@ void Conv::execute_tile( ); } +template <> +template <> +void Conv::execute_tile( + int n_channels, + const void *weight_bias_ptr, + const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + float *outptrs[Base::output_tile_rows][Base::output_tile_cols] +) +{ + __asm __volatile( + "mov x27, xzr\n" + "mov x28, xzr\n" + "and x26, %[n_channels], #3\n" + "lsr x25, %[n_channels], #2\n" + "cbz x25, 4f\n" + "1:\n" + "ldr q15, [%[wbptr]]\n" + "ldr x21, [%[inptrs], 0]\n" + "mov v8.16b, v15.16b\n" + "ldr q14, [%[wbptr], #16]\n" + "mov v3.16b, v15.16b\n" + "ldr q10, [%[wbptr], #32]\n" + "mov v2.16b, v15.16b\n" + "ldr q7, [%[wbptr], #48]\n" + "mov v4.16b, v15.16b\n" + "ldr q13, [%[wbptr], #64]\n" + "ldr q5, [%[wbptr], #80]\n" + "ldr x22, [%[inptrs], 40]\n" + "ldr q0, [%[wbptr], #96]\n" + "ldr x20, [%[inptrs], 80]\n" + "ldr q9, [%[wbptr], #112]\n" + "ldr x23, [%[inptrs], 120]\n" + "ldr q6, [%[wbptr], #128]\n" + "subs x25, x25, #1\n" + "ldr q1, [%[wbptr], #144]\n" + "ldr q17, [x21, x27]\n" + "fmla v8.4s, v17.4s, v14.4s\n" + "ldr q18, [x22, x27]\n" + "ldr q16, [x20, x27]\n" + "ldr x21, [%[inptrs], 8]\n" + "ldr q17, [x23, x27]\n" + "ldr x22, [%[inptrs], 48]\n" + "ldr q11, [x21, x27]\n" + "ldr x20, [%[inptrs], 88]\n" + "fmla v8.4s, v18.4s, v13.4s\n" + "ldr q19, [x22, x27]\n" + "ldr q15, [x20, x27]\n" + "ldr x21, [%[inptrs], 16]\n" + "ldr q12, [x21, x27]\n" + "fmla v8.4s, v11.4s, v10.4s\n" + "fmla v8.4s, v16.4s, v9.4s\n" + "beq 3f\n" + "2:\n" + "fmla v3.4s, v16.4s, v14.4s\n" + "ldr x22, [%[inptrs], 56]\n" + "fmla v8.4s, v19.4s, v5.4s\n" + "ldr x21, [%[inptrs], 24]\n" + "fmla v2.4s, v12.4s, v14.4s\n" + "ldr q16, [x22, x27]\n" + "movi v11.16b, #0\n" + "ldr q18, [x21, x27]\n" + "fmla v3.4s, v17.4s, v13.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v8.4s, v12.4s, v7.4s\n" + "ldr x23, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v13.4s\n" + "ldr q19, [x20, x27]\n" + "fmov v12.4s, #6.0\n" + "ldr q17, [x23, x27]\n" + "fmla v3.4s, v15.4s, v10.4s\n" + "ldr x20, [%[inptrs], 96]\n" + "fmla v8.4s, v15.4s, v6.4s\n" + "ldr x22, [%[inptrs], 64]\n" + "fmla v2.4s, v18.4s, v10.4s\n" + "ldr q15, [x20, x27]\n" + "fmla v4.4s, v15.4s, v14.4s\n" + "ldr q18, [x22, x27]\n" + "fmla v3.4s, v19.4s, v9.4s\n" + "ldr x21, [%[inptrs], 32]\n" + "fmla v8.4s, v16.4s, v0.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v2.4s, v15.4s, v9.4s\n" + "ldr q19, [x21, x27]\n" + "ldr q16, [x20, x27]\n" + "ldr x23, [%[inptrs], 136]\n" + "fmla v3.4s, v17.4s, v5.4s\n" + "ldr x20, [%[inptrs], 104]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr q14, [x23, x27]\n" + "fmla v2.4s, v18.4s, v5.4s\n" + "ldr q17, [x20, x27]\n" + "fmla v4.4s, v14.4s, v13.4s\n" + "ldr x22, [%[inptrs], 72]\n" + "fmla v3.4s, v15.4s, v7.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmax v8.4s, v8.4s, v11.4s\n" + "ldr q18, [x22, x27]\n" + "fmla v2.4s, v19.4s, v7.4s\n" + "ldr q13, [x20, x27]\n" + "fmla v4.4s, v17.4s, v10.4s\n" + "ldr x23, [%[inptrs], 144]\n" + "fmla v3.4s, v16.4s, v6.4s\n" + "ldr x20, [%[inptrs], 112]\n" + "fmin v8.4s, v8.4s, v12.4s\n" + "ldr q10, [x23, x27]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr q15, [x20, x27]\n" + "fmla v4.4s, v13.4s, v9.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v3.4s, v14.4s, v0.4s\n" + "ldr x23, [%[inptrs], 152]\n" + "ldr q9, [x20, x27]\n" + "ldr x22, [%[outptrs], 0]\n" + "fmla v2.4s, v18.4s, v0.4s\n" + "ldr q19, [x23, x27]\n" + "str q8, [x22, x28]\n" + "fmla v4.4s, v10.4s, v5.4s\n" + "fmla v3.4s, v13.4s, v1.4s\n" + "ldr x20, [%[inptrs], 192]\n" + "ldr x22, [%[outptrs], 8]\n" + "ldr x24, [%[outptrs], 16]\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v2.4s, v15.4s, v1.4s\n" + "ldr q16, [x20, x27]\n" + "fmla v4.4s, v15.4s, v7.4s\n" + "ldr q15, [%[wbptr]]\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "ldr q14, [%[wbptr], #16]\n" + "mov v8.16b, v15.16b\n" + "ldr q10, [%[wbptr], #32]\n" + "fmax v2.4s, v2.4s, v11.4s\n" + "ldr q13, [%[wbptr], #64]\n" + "fmla v4.4s, v9.4s, v6.4s\n" + "ldr q7, [%[wbptr], #48]\n" + "fmin v3.4s, v3.4s, v12.4s\n" + "ldr q5, [%[wbptr], #80]\n" + "fmin v2.4s, v2.4s, v12.4s\n" + "ldr q9, [%[wbptr], #112]\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "add x27, x27, #16\n" + "str q3, [x24, x28]\n" + "fmla v4.4s, v19.4s, v0.4s\n" + "str q2, [x22, x28]\n" + "mov v3.16b, v15.16b\n" + "mov v2.16b, v15.16b\n" + "ldr q6, [%[wbptr], #128]\n" + "ldr x24, [%[outptrs], 24]\n" + "ldr x21, [%[inptrs], 0]\n" + "ldr x22, [%[inptrs], 40]\n" + "fmla v4.4s, v16.4s, v1.4s\n" + "ldr q0, [%[wbptr], #96]\n" + "ldr q17, [x21, x27]\n" + "ldr x20, [%[inptrs], 80]\n" + "fmla v8.4s, v17.4s, v14.4s\n" + "ldr q18, [x22, x27]\n" + "ldr q16, [x20, x27]\n" + "ldr x21, [%[inptrs], 8]\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "ldr q1, [%[wbptr], #144]\n" + "ldr q11, [x21, x27]\n" + "ldr x22, [%[inptrs], 48]\n" + "fmla v8.4s, v18.4s, v13.4s\n" + "ldr x21, [%[inptrs], 16]\n" + "fmin v4.4s, v4.4s, v12.4s\n" + "ldr q19, [x22, x27]\n" + "ldr q12, [x21, x27]\n" + "ldr x23, [%[inptrs], 120]\n" + "ldr x20, [%[inptrs], 88]\n" + "subs x25, x25, #1\n" + "str q4, [x24, x28]\n" + "mov v4.16b, v15.16b\n" + "ldr q17, [x23, x27]\n" + "fmla v8.4s, v11.4s, v10.4s\n" + "ldr q15, [x20, x27]\n" + "add x28, x28, #16\n" + "fmla v8.4s, v16.4s, v9.4s\n" + "bne 2b\n" + "3:\n" + "fmla v3.4s, v16.4s, v14.4s\n" + "ldr x22, [%[inptrs], 56]\n" + "fmla v8.4s, v19.4s, v5.4s\n" + "ldr x21, [%[inptrs], 24]\n" + "fmla v2.4s, v12.4s, v14.4s\n" + "ldr q16, [x22, x27]\n" + "movi v11.16b, #0\n" + "ldr q18, [x21, x27]\n" + "fmla v3.4s, v17.4s, v13.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v8.4s, v12.4s, v7.4s\n" + "ldr x23, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v13.4s\n" + "ldr q19, [x20, x27]\n" + "fmov v12.4s, #6.0\n" + "ldr q17, [x23, x27]\n" + "fmla v3.4s, v15.4s, v10.4s\n" + "ldr x20, [%[inptrs], 96]\n" + "fmla v8.4s, v15.4s, v6.4s\n" + "ldr x22, [%[inptrs], 64]\n" + "fmla v2.4s, v18.4s, v10.4s\n" + "ldr q15, [x20, x27]\n" + "fmla v4.4s, v15.4s, v14.4s\n" + "ldr q18, [x22, x27]\n" + "fmla v3.4s, v19.4s, v9.4s\n" + "ldr x21, [%[inptrs], 32]\n" + "fmla v8.4s, v16.4s, v0.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v2.4s, v15.4s, v9.4s\n" + "ldr q19, [x21, x27]\n" + "ldr q16, [x20, x27]\n" + "ldr x23, [%[inptrs], 136]\n" + "fmla v3.4s, v17.4s, v5.4s\n" + "ldr x20, [%[inptrs], 104]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr q14, [x23, x27]\n" + "fmla v2.4s, v18.4s, v5.4s\n" + "ldr q17, [x20, x27]\n" + "fmla v4.4s, v14.4s, v13.4s\n" + "ldr x22, [%[inptrs], 72]\n" + "fmla v3.4s, v15.4s, v7.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmax v8.4s, v8.4s, v11.4s\n" + "ldr q18, [x22, x27]\n" + "fmla v2.4s, v19.4s, v7.4s\n" + "ldr q13, [x20, x27]\n" + "fmla v4.4s, v17.4s, v10.4s\n" + "ldr x23, [%[inptrs], 144]\n" + "fmla v3.4s, v16.4s, v6.4s\n" + "ldr x20, [%[inptrs], 112]\n" + "fmin v8.4s, v8.4s, v12.4s\n" + "ldr q10, [x23, x27]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr q15, [x20, x27]\n" + "fmla v4.4s, v13.4s, v9.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v3.4s, v14.4s, v0.4s\n" + "ldr x23, [%[inptrs], 152]\n" + "ldr q9, [x20, x27]\n" + "ldr x22, [%[outptrs], 0]\n" + "fmla v2.4s, v18.4s, v0.4s\n" + "ldr q19, [x23, x27]\n" + "str q8, [x22, x28]\n" + "fmla v4.4s, v10.4s, v5.4s\n" + "fmla v3.4s, v13.4s, v1.4s\n" + "ldr x20, [%[inptrs], 192]\n" + "ldr x22, [%[outptrs], 8]\n" + "ldr x24, [%[outptrs], 16]\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v2.4s, v15.4s, v1.4s\n" + "ldr q16, [x20, x27]\n" + "fmla v4.4s, v15.4s, v7.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "add x27, x27, #16\n" + "fmax v2.4s, v2.4s, v11.4s\n" + "fmla v4.4s, v9.4s, v6.4s\n" + "fmin v3.4s, v3.4s, v12.4s\n" + "fmin v2.4s, v2.4s, v12.4s\n" + "str q3, [x24, x28]\n" + "fmla v4.4s, v19.4s, v0.4s\n" + "str q2, [x22, x28]\n" + "ldr x24, [%[outptrs], 24]\n" + "fmla v4.4s, v16.4s, v1.4s\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "fmin v4.4s, v4.4s, v12.4s\n" + "str q4, [x24, x28]\n" + "add x28, x28, #16\n" + "4:\n" + "cbz x26, 7f\n" + "ldr s15, [%[wbptr]]\n" + "mov v8.16b, v15.16b\n" + "ldr s14, [%[wbptr], #4]\n" + "mov v3.16b, v15.16b\n" + "ldr s10, [%[wbptr], #8]\n" + "mov v2.16b, v15.16b\n" + "ldr s7, [%[wbptr], #12]\n" + "mov v4.16b, v15.16b\n" + "ldr s13, [%[wbptr], #16]\n" + "ldr s5, [%[wbptr], #20]\n" + "ldr x21, [%[inptrs], 0]\n" + "ldr s0, [%[wbptr], #24]\n" + "ldr x22, [%[inptrs], 40]\n" + "ldr s9, [%[wbptr], #28]\n" + "ldr x20, [%[inptrs], 80]\n" + "ldr s6, [%[wbptr], #32]\n" + "ldr x23, [%[inptrs], 120]\n" + "ldr s1, [%[wbptr], #36]\n" + "subs x26, x26, #1\n" + "ldr s17, [x21, x27]\n" + "ldr s18, [x22, x27]\n" + "fmla v8.4s, v17.4s, v14.4s\n" + "ldr s16, [x20, x27]\n" + "ldr s17, [x23, x27]\n" + "ldr x21, [%[inptrs], 8]\n" + "ldr x22, [%[inptrs], 48]\n" + "ldr x20, [%[inptrs], 88]\n" + "ldr s11, [x21, x27]\n" + "fmla v8.4s, v18.4s, v13.4s\n" + "ldr s19, [x22, x27]\n" + "ldr s15, [x20, x27]\n" + "ldr x21, [%[inptrs], 16]\n" + "ldr s12, [x21, x27]\n" + "fmla v8.4s, v11.4s, v10.4s\n" + "fmla v8.4s, v16.4s, v9.4s\n" + "beq 6f\n" + "5:\n" + "fmla v3.4s, v16.4s, v14.4s\n" + "ldr x22, [%[inptrs], 56]\n" + "fmla v8.4s, v19.4s, v5.4s\n" + "ldr x21, [%[inptrs], 24]\n" + "fmla v2.4s, v12.4s, v14.4s\n" + "ldr s16, [x22, x27]\n" + "movi v11.16b, #0\n" + "ldr s18, [x21, x27]\n" + "fmla v3.4s, v17.4s, v13.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v8.4s, v12.4s, v7.4s\n" + "ldr x23, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v13.4s\n" + "ldr s19, [x20, x27]\n" + "fmov v12.4s, #6.0\n" + "ldr s17, [x23, x27]\n" + "fmla v3.4s, v15.4s, v10.4s\n" + "ldr x20, [%[inptrs], 96]\n" + "fmla v8.4s, v15.4s, v6.4s\n" + "ldr x22, [%[inptrs], 64]\n" + "fmla v2.4s, v18.4s, v10.4s\n" + "ldr s15, [x20, x27]\n" + "fmla v4.4s, v15.4s, v14.4s\n" + "ldr s18, [x22, x27]\n" + "fmla v3.4s, v19.4s, v9.4s\n" + "ldr x21, [%[inptrs], 32]\n" + "fmla v8.4s, v16.4s, v0.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v2.4s, v15.4s, v9.4s\n" + "ldr s19, [x21, x27]\n" + "ldr s16, [x20, x27]\n" + "ldr x23, [%[inptrs], 136]\n" + "fmla v3.4s, v17.4s, v5.4s\n" + "ldr x20, [%[inptrs], 104]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr s14, [x23, x27]\n" + "fmla v2.4s, v18.4s, v5.4s\n" + "ldr s17, [x20, x27]\n" + "fmla v4.4s, v14.4s, v13.4s\n" + "ldr x22, [%[inptrs], 72]\n" + "fmla v3.4s, v15.4s, v7.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmax v8.4s, v8.4s, v11.4s\n" + "ldr s18, [x22, x27]\n" + "fmla v2.4s, v19.4s, v7.4s\n" + "ldr s13, [x20, x27]\n" + "fmla v4.4s, v17.4s, v10.4s\n" + "ldr x23, [%[inptrs], 144]\n" + "fmla v3.4s, v16.4s, v6.4s\n" + "ldr x20, [%[inptrs], 112]\n" + "fmin v8.4s, v8.4s, v12.4s\n" + "ldr s10, [x23, x27]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr s15, [x20, x27]\n" + "fmla v4.4s, v13.4s, v9.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v3.4s, v14.4s, v0.4s\n" + "ldr x23, [%[inptrs], 152]\n" + "ldr s9, [x20, x27]\n" + "ldr x22, [%[outptrs], 0]\n" + "fmla v2.4s, v18.4s, v0.4s\n" + "ldr s19, [x23, x27]\n" + "str s8, [x22, x28]\n" + "fmla v4.4s, v10.4s, v5.4s\n" + "fmla v3.4s, v13.4s, v1.4s\n" + "ldr x20, [%[inptrs], 192]\n" + "ldr x22, [%[outptrs], 8]\n" + "ldr x24, [%[outptrs], 16]\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v2.4s, v15.4s, v1.4s\n" + "ldr s16, [x20, x27]\n" + "fmla v4.4s, v15.4s, v7.4s\n" + "ldr s15, [%[wbptr]]\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "ldr s14, [%[wbptr], #4]\n" + "mov v8.16b, v15.16b\n" + "ldr s10, [%[wbptr], #8]\n" + "fmax v2.4s, v2.4s, v11.4s\n" + "ldr s13, [%[wbptr], #16]\n" + "fmla v4.4s, v9.4s, v6.4s\n" + "ldr s7, [%[wbptr], #12]\n" + "fmin v3.4s, v3.4s, v12.4s\n" + "ldr s5, [%[wbptr], #20]\n" + "fmin v2.4s, v2.4s, v12.4s\n" + "ldr s9, [%[wbptr], #28]\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "add x27, x27, #4\n" + "str s3, [x24, x28]\n" + "fmla v4.4s, v19.4s, v0.4s\n" + "str s2, [x22, x28]\n" + "mov v3.16b, v15.16b\n" + "mov v2.16b, v15.16b\n" + "ldr s6, [%[wbptr], #32]\n" + "ldr x24, [%[outptrs], 24]\n" + "ldr x21, [%[inptrs], 0]\n" + "ldr x22, [%[inptrs], 40]\n" + "fmla v4.4s, v16.4s, v1.4s\n" + "ldr s0, [%[wbptr], #24]\n" + "ldr s17, [x21, x27]\n" + "ldr x20, [%[inptrs], 80]\n" + "fmla v8.4s, v17.4s, v14.4s\n" + "ldr s18, [x22, x27]\n" + "ldr s16, [x20, x27]\n" + "ldr x21, [%[inptrs], 8]\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "ldr s1, [%[wbptr], #36]\n" + "ldr s11, [x21, x27]\n" + "ldr x22, [%[inptrs], 48]\n" + "fmla v8.4s, v18.4s, v13.4s\n" + "ldr x21, [%[inptrs], 16]\n" + "fmin v4.4s, v4.4s, v12.4s\n" + "ldr s19, [x22, x27]\n" + "ldr s12, [x21, x27]\n" + "ldr x23, [%[inptrs], 120]\n" + "ldr x20, [%[inptrs], 88]\n" + "subs x26, x26, #1\n" + "str s4, [x24, x28]\n" + "mov v4.16b, v15.16b\n" + "ldr s17, [x23, x27]\n" + "fmla v8.4s, v11.4s, v10.4s\n" + "ldr s15, [x20, x27]\n" + "add x28, x28, #4\n" + "fmla v8.4s, v16.4s, v9.4s\n" + "bne 5b\n" + "6:\n" + "fmla v3.4s, v16.4s, v14.4s\n" + "ldr x22, [%[inptrs], 56]\n" + "fmla v8.4s, v19.4s, v5.4s\n" + "ldr x21, [%[inptrs], 24]\n" + "fmla v2.4s, v12.4s, v14.4s\n" + "ldr s16, [x22, x27]\n" + "movi v11.16b, #0\n" + "ldr s18, [x21, x27]\n" + "fmla v3.4s, v17.4s, v13.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v8.4s, v12.4s, v7.4s\n" + "ldr x23, [%[inptrs], 128]\n" + "fmla v2.4s, v16.4s, v13.4s\n" + "ldr s19, [x20, x27]\n" + "fmov v12.4s, #6.0\n" + "ldr s17, [x23, x27]\n" + "fmla v3.4s, v15.4s, v10.4s\n" + "ldr x20, [%[inptrs], 96]\n" + "fmla v8.4s, v15.4s, v6.4s\n" + "ldr x22, [%[inptrs], 64]\n" + "fmla v2.4s, v18.4s, v10.4s\n" + "ldr s15, [x20, x27]\n" + "fmla v4.4s, v15.4s, v14.4s\n" + "ldr s18, [x22, x27]\n" + "fmla v3.4s, v19.4s, v9.4s\n" + "ldr x21, [%[inptrs], 32]\n" + "fmla v8.4s, v16.4s, v0.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v2.4s, v15.4s, v9.4s\n" + "ldr s19, [x21, x27]\n" + "ldr s16, [x20, x27]\n" + "ldr x23, [%[inptrs], 136]\n" + "fmla v3.4s, v17.4s, v5.4s\n" + "ldr x20, [%[inptrs], 104]\n" + "fmla v8.4s, v15.4s, v1.4s\n" + "ldr s14, [x23, x27]\n" + "fmla v2.4s, v18.4s, v5.4s\n" + "ldr s17, [x20, x27]\n" + "fmla v4.4s, v14.4s, v13.4s\n" + "ldr x22, [%[inptrs], 72]\n" + "fmla v3.4s, v15.4s, v7.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmax v8.4s, v8.4s, v11.4s\n" + "ldr s18, [x22, x27]\n" + "fmla v2.4s, v19.4s, v7.4s\n" + "ldr s13, [x20, x27]\n" + "fmla v4.4s, v17.4s, v10.4s\n" + "ldr x23, [%[inptrs], 144]\n" + "fmla v3.4s, v16.4s, v6.4s\n" + "ldr x20, [%[inptrs], 112]\n" + "fmin v8.4s, v8.4s, v12.4s\n" + "ldr s10, [x23, x27]\n" + "fmla v2.4s, v17.4s, v6.4s\n" + "ldr s15, [x20, x27]\n" + "fmla v4.4s, v13.4s, v9.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v3.4s, v14.4s, v0.4s\n" + "ldr x23, [%[inptrs], 152]\n" + "ldr s9, [x20, x27]\n" + "ldr x22, [%[outptrs], 0]\n" + "fmla v2.4s, v18.4s, v0.4s\n" + "ldr s19, [x23, x27]\n" + "str s8, [x22, x28]\n" + "fmla v4.4s, v10.4s, v5.4s\n" + "fmla v3.4s, v13.4s, v1.4s\n" + "ldr x20, [%[inptrs], 192]\n" + "ldr x22, [%[outptrs], 8]\n" + "ldr x24, [%[outptrs], 16]\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v2.4s, v15.4s, v1.4s\n" + "ldr s16, [x20, x27]\n" + "fmla v4.4s, v15.4s, v7.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmax v3.4s, v3.4s, v11.4s\n" + "add x27, x27, #4\n" + "fmax v2.4s, v2.4s, v11.4s\n" + "fmla v4.4s, v9.4s, v6.4s\n" + "fmin v3.4s, v3.4s, v12.4s\n" + "fmin v2.4s, v2.4s, v12.4s\n" + "str s3, [x24, x28]\n" + "fmla v4.4s, v19.4s, v0.4s\n" + "str s2, [x22, x28]\n" + "ldr x24, [%[outptrs], 24]\n" + "fmla v4.4s, v16.4s, v1.4s\n" + "fmax v4.4s, v4.4s, v11.4s\n" + "fmin v4.4s, v4.4s, v12.4s\n" + "str s4, [x24, x28]\n" + "add x28, x28, #4\n" + "7:\n" + : [wbptr] "+r" (weight_bias_ptr) + : [inptrs] "r" (inptrs), [outptrs] "r" (outptrs), [n_channels] "r" ((long) n_channels) + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + ); +} + #endif // __aarch64__ template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float, float>; diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp index ff0e454c76..a583615c99 100644 --- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp +++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp @@ -1167,6 +1167,1155 @@ void Conv::execute_tile( ); } +template <> +template <> +void Conv::execute_tile( + int n_channels, + const void *weight_bias_ptr, + const float *inptrs[6][6], + float *outptrs[4][4] +) +{ + __asm __volatile( + "mov x27, xzr\n" + "mov x28, xzr\n" + "and x15, %[n_channels], #3\n" + "lsr x16, %[n_channels], #2\n" + "cbz x16, 4f\n" + "1:\n" + "ldr q13, [%[wbptr]]\n" + "ldr x17, [%[inptrs], 0]\n" + "mov v18.16b, v13.16b\n" + "ldr q12, [%[wbptr], #16]\n" + "mov v22.16b, v13.16b\n" + "ldr q11, [%[wbptr], #32]\n" + "mov v23.16b, v13.16b\n" + "ldr q10, [%[wbptr], #48]\n" + "mov v19.16b, v13.16b\n" + "ldr q9, [%[wbptr], #64]\n" + "mov v17.16b, v13.16b\n" + "ldr q8, [%[wbptr], #80]\n" + "mov v14.16b, v13.16b\n" + "ldr q7, [%[wbptr], #96]\n" + "mov v0.16b, v13.16b\n" + "ldr q6, [%[wbptr], #112]\n" + "mov v1.16b, v13.16b\n" + "ldr q5, [%[wbptr], #128]\n" + "mov v2.16b, v13.16b\n" + "ldr q4, [%[wbptr], #144]\n" + "ldr q29, [x17, x27]\n" + "ldr x18, [%[inptrs], 48]\n" + "fmla v18.4s, v29.4s, v12.4s\n" + "ldr x17, [%[inptrs], 8]\n" + "ldr q27, [x18, x27]\n" + "ldr x19, [%[inptrs], 96]\n" + "ldr q28, [x17, x27]\n" + "ldr x18, [%[inptrs], 56]\n" + "ldr q25, [x19, x27]\n" + "ldr x17, [%[inptrs], 16]\n" + "ldr q16, [x18, x27]\n" + "ldr x20, [%[inptrs], 144]\n" + "ldr q15, [x17, x27]\n" + "ldr x19, [%[inptrs], 104]\n" + "ldr q21, [x20, x27]\n" + "subs x16, x16, #1\n" + "ldr q29, [x19, x27]\n" + "beq 3f\n" + "2:\n" + "mov v3.16b, v13.16b\n" + "ldr x18, [%[inptrs], 64]\n" + "fmla v18.4s, v27.4s, v9.4s\n" + "ldr x17, [%[inptrs], 24]\n" + "fmla v22.4s, v27.4s, v12.4s\n" + "ldr q30, [x18, x27]\n" + "fmla v23.4s, v28.4s, v12.4s\n" + "ldr x21, [%[inptrs], 192]\n" + "fmla v19.4s, v25.4s, v12.4s\n" + "ldr x20, [%[inptrs], 152]\n" + "fmla v18.4s, v28.4s, v11.4s\n" + "ldr q24, [x17, x27]\n" + "fmla v22.4s, v25.4s, v9.4s\n" + "ldr x19, [%[inptrs], 112]\n" + "fmla v23.4s, v16.4s, v9.4s\n" + "ldr x18, [%[inptrs], 72]\n" + "fmla v17.4s, v16.4s, v12.4s\n" + "ldr x17, [%[inptrs], 32]\n" + "fmla v18.4s, v25.4s, v6.4s\n" + "ldr q31, [x21, x27]\n" + "fmla v22.4s, v16.4s, v11.4s\n" + "ldr x22, [%[inptrs], 240]\n" + "fmla v23.4s, v15.4s, v11.4s\n" + "ldr x21, [%[inptrs], 200]\n" + "fmla v14.4s, v15.4s, v12.4s\n" + "ldr x23, [%[outptrs], 0]\n" + "fmla v18.4s, v16.4s, v8.4s\n" + "ldr q25, [x20, x27]\n" + "fmla v22.4s, v21.4s, v6.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v19.4s, v21.4s, v9.4s\n" + "ldr x24, [%[outptrs], 32]\n" + "fmla v0.4s, v21.4s, v12.4s\n" + "ldr q21, [x19, x27]\n" + "fmla v18.4s, v15.4s, v10.4s\n" + "ldr q20, [x18, x27]\n" + "fmla v22.4s, v29.4s, v8.4s\n" + "ldr x19, [%[inptrs], 120]\n" + "fmla v23.4s, v29.4s, v6.4s\n" + "ldr x18, [%[inptrs], 80]\n" + "fmla v19.4s, v29.4s, v11.4s\n" + "ldr x25, [%[outptrs], 64]\n" + "fmla v18.4s, v29.4s, v5.4s\n" + "ldr x26, [%[outptrs], 96]\n" + "fmla v17.4s, v29.4s, v9.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v1.4s, v29.4s, v12.4s\n" + "ldr q26, [x17, x27]\n" + "fmla v22.4s, v30.4s, v10.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v18.4s, v30.4s, v7.4s\n" + "ldr x17, [%[inptrs], 40]\n" + "fmla v23.4s, v30.4s, v8.4s\n" + "subs x16, x16, #1\n" + "fmla v17.4s, v30.4s, v11.4s\n" + "fmla v14.4s, v30.4s, v9.4s\n" + "fmla v2.4s, v30.4s, v12.4s\n" + "ldr q27, [x22, x27]\n" + "fmla v3.4s, v24.4s, v12.4s\n" + "ldr x22, [%[inptrs], 248]\n" + "fmla v23.4s, v24.4s, v10.4s\n" + "fmla v19.4s, v31.4s, v6.4s\n" + "fmla v14.4s, v24.4s, v11.4s\n" + "ldr q30, [x21, x27]\n" + "fmla v0.4s, v31.4s, v9.4s\n" + "ldr q24, [x20, x27]\n" + "fmla v22.4s, v25.4s, v5.4s\n" + "ldr x21, [%[inptrs], 208]\n" + "fmla v19.4s, v25.4s, v8.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v17.4s, v25.4s, v6.4s\n" + "fmla v1.4s, v25.4s, v9.4s\n" + "fmla v0.4s, v25.4s, v11.4s\n" + "fmla v18.4s, v21.4s, v4.4s\n" + "fmla v22.4s, v21.4s, v7.4s\n" + "fmla v23.4s, v21.4s, v5.4s\n" + "fmla v19.4s, v21.4s, v10.4s\n" + "fmla v14.4s, v21.4s, v6.4s\n" + "fmla v17.4s, v21.4s, v8.4s\n" + "fmla v1.4s, v21.4s, v11.4s\n" + "str q18, [x23, x28]\n" + "mov v16.16b, v13.16b\n" + "fmla v2.4s, v21.4s, v9.4s\n" + "ldr x23, [%[outptrs], 8]\n" + "fmla v23.4s, v20.4s, v7.4s\n" + "fmla v14.4s, v20.4s, v8.4s\n" + "fmla v16.4s, v25.4s, v12.4s\n" + "ldr q25, [x19, x27]\n" + "fmla v17.4s, v20.4s, v10.4s\n" + "ldr x19, [%[inptrs], 128]\n" + "fmla v2.4s, v20.4s, v11.4s\n" + "fmla v3.4s, v20.4s, v9.4s\n" + "fmla v14.4s, v26.4s, v10.4s\n" + "fmla v0.4s, v27.4s, v6.4s\n" + "mov v15.16b, v13.16b\n" + "fmla v19.4s, v30.4s, v5.4s\n" + "fmla v1.4s, v30.4s, v6.4s\n" + "fmla v16.4s, v30.4s, v9.4s\n" + "fmla v3.4s, v26.4s, v11.4s\n" + "ldr q29, [x18, x27]\n" + "fmla v15.4s, v21.4s, v12.4s\n" + "ldr q27, [x17, x27]\n" + "fmla v0.4s, v30.4s, v8.4s\n" + "ldr q28, [x22, x27]\n" + "fmla v22.4s, v24.4s, v4.4s\n" + "ldr x18, [%[inptrs], 88]\n" + "fmla v19.4s, v24.4s, v7.4s\n" + "ldr x22, [%[inptrs], 256]\n" + "fmla v17.4s, v24.4s, v5.4s\n" + "ldr x17, [%[inptrs], 0]\n" + "fmla v0.4s, v24.4s, v10.4s\n" + "fmla v1.4s, v24.4s, v8.4s\n" + "str q22, [x24, x28]\n" + "mov v18.16b, v13.16b\n" + "fmla v2.4s, v24.4s, v6.4s\n" + "ldr x24, [%[outptrs], 40]\n" + "fmla v16.4s, v24.4s, v11.4s\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "fmla v18.4s, v20.4s, v12.4s\n" + "ldr q22, [x21, x27]\n" + "fmla v23.4s, v25.4s, v4.4s\n" + "ldr x21, [%[inptrs], 216]\n" + "fmla v17.4s, v25.4s, v7.4s\n" + "fmla v14.4s, v25.4s, v5.4s\n" + "fmla v1.4s, v25.4s, v10.4s\n" + "fmla v2.4s, v25.4s, v8.4s\n" + "fmla v3.4s, v25.4s, v6.4s\n" + "fmla v15.4s, v25.4s, v11.4s\n" + "str q23, [x23, x28]\n" + "mov v21.16b, v13.16b\n" + "fmla v18.4s, v25.4s, v9.4s\n" + "ldr x23, [%[outptrs], 16]\n" + "fmla v14.4s, v29.4s, v7.4s\n" + "fmla v2.4s, v29.4s, v10.4s\n" + "fmla v21.4s, v24.4s, v12.4s\n" + "ldr q30, [x20, x27]\n" + "fmla v3.4s, v29.4s, v8.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmla v18.4s, v29.4s, v11.4s\n" + "ldr q31, [x19, x27]\n" + "fmla v0.4s, v28.4s, v5.4s\n" + "ldr x19, [%[inptrs], 136]\n" + "fmla v16.4s, v28.4s, v6.4s\n" + "ldr q26, [x18, x27]\n" + "fmla v3.4s, v27.4s, v10.4s\n" + "ldr q23, [x22, x27]\n" + "fmla v19.4s, v22.4s, v4.4s\n" + "ldr x22, [%[inptrs], 264]\n" + "fmla v0.4s, v22.4s, v7.4s\n" + "ldr x18, [%[inptrs], 48]\n" + "fmla v1.4s, v22.4s, v5.4s\n" + "fmla v16.4s, v22.4s, v8.4s\n" + "fmla v15.4s, v22.4s, v6.4s\n" + "fmla v21.4s, v22.4s, v9.4s\n" + "str q19, [x25, x28]\n" + "mov v24.16b, v13.16b\n" + "mov v20.16b, v13.16b\n" + "ldr q27, [x21, x27]\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "ldr x21, [%[inptrs], 224]\n" + "fmla v24.4s, v25.4s, v12.4s\n" + "ldr q28, [x20, x27]\n" + "fmla v1.4s, v30.4s, v7.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v2.4s, v30.4s, v5.4s\n" + "ldr x25, [%[outptrs], 72]\n" + "str q17, [x24, x28]\n" + "fmla v16.4s, v30.4s, v10.4s\n" + "fmla v15.4s, v30.4s, v8.4s\n" + "ldr q22, [x19, x27]\n" + "fmla v18.4s, v30.4s, v6.4s\n" + "ldr x24, [%[outptrs], 48]\n" + "fmla v21.4s, v30.4s, v11.4s\n" + "ldr x19, [%[inptrs], 96]\n" + "fmla v24.4s, v30.4s, v9.4s\n" + "fmla v20.4s, v30.4s, v12.4s\n" + "fmla v14.4s, v31.4s, v4.4s\n" + "ldr q30, [x22, x27]\n" + "fmla v2.4s, v31.4s, v7.4s\n" + "ldr q19, [x21, x27]\n" + "fmla v3.4s, v31.4s, v5.4s\n" + "ldr x22, [%[inptrs], 272]\n" + "fmla v15.4s, v31.4s, v10.4s\n" + "ldr x21, [%[inptrs], 232]\n" + "str q14, [x23, x28]\n" + "fmla v18.4s, v31.4s, v8.4s\n" + "fmla v24.4s, v31.4s, v11.4s\n" + "ldr q31, [x20, x27]\n" + "fmla v3.4s, v26.4s, v7.4s\n" + "ldr q17, [x22, x27]\n" + "fmla v0.4s, v23.4s, v4.4s\n" + "ldr x22, [%[inptrs], 280]\n" + "fmla v18.4s, v26.4s, v10.4s\n" + "ldr q14, [x21, x27]\n" + "fmla v16.4s, v23.4s, v5.4s\n" + "ldr x23, [%[outptrs], 24]\n" + "fmla v21.4s, v23.4s, v6.4s\n" + "ldr q26, [x22, x27]\n" + "str q0, [x26, x28]\n" + "fmla v1.4s, v27.4s, v4.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "ldr q13, [%[wbptr]]\n" + "fmla v16.4s, v27.4s, v7.4s\n" + "ldr x26, [%[outptrs], 104]\n" + "fmla v21.4s, v27.4s, v8.4s\n" + "add x27, x27, #16\n" + "str q1, [x25, x28]\n" + "fmla v24.4s, v27.4s, v6.4s\n" + "fmla v20.4s, v27.4s, v9.4s\n" + "ldr q12, [%[wbptr], #16]\n" + "fmla v2.4s, v28.4s, v4.4s\n" + "ldr q29, [x17, x27]\n" + "fmla v15.4s, v28.4s, v7.4s\n" + "ldr q27, [x18, x27]\n" + "fmla v18.4s, v28.4s, v5.4s\n" + "ldr x25, [%[outptrs], 80]\n" + "fmla v21.4s, v28.4s, v10.4s\n" + "ldr x17, [%[inptrs], 8]\n" + "str q2, [x24, x28]\n" + "fmla v24.4s, v28.4s, v8.4s\n" + "fmla v20.4s, v28.4s, v11.4s\n" + "ldr q9, [%[wbptr], #64]\n" + "fmla v3.4s, v22.4s, v4.4s\n" + "ldr q28, [x17, x27]\n" + "fmla v18.4s, v22.4s, v7.4s\n" + "ldr q25, [x19, x27]\n" + "fmla v24.4s, v22.4s, v10.4s\n" + "ldr x24, [%[outptrs], 56]\n" + "fmla v16.4s, v30.4s, v4.4s\n" + "ldr q11, [%[wbptr], #32]\n" + "str q3, [x23, x28]\n" + "fmla v21.4s, v30.4s, v5.4s\n" + "fmla v20.4s, v30.4s, v6.4s\n" + "ldr x18, [%[inptrs], 56]\n" + "fmla v15.4s, v19.4s, v4.4s\n" + "ldr x17, [%[inptrs], 16]\n" + "str q16, [x26, x28]\n" + "fmla v24.4s, v19.4s, v5.4s\n" + "fmla v21.4s, v19.4s, v7.4s\n" + "ldr q16, [x18, x27]\n" + "fmla v20.4s, v19.4s, v8.4s\n" + "ldr q6, [%[wbptr], #112]\n" + "str q15, [x25, x28]\n" + "fmla v18.4s, v31.4s, v4.4s\n" + "fmla v24.4s, v31.4s, v7.4s\n" + "ldr q15, [x17, x27]\n" + "fmla v21.4s, v17.4s, v4.4s\n" + "ldr x25, [%[outptrs], 88]\n" + "fmla v20.4s, v31.4s, v10.4s\n" + "ldr q8, [%[wbptr], #80]\n" + "str q18, [x24, x28]\n" + "mov v18.16b, v13.16b\n" + "fmla v24.4s, v14.4s, v4.4s\n" + "ldr x26, [%[outptrs], 112]\n" + "mov v22.16b, v13.16b\n" + "ldr x20, [%[inptrs], 144]\n" + "str q21, [x26, x28]\n" + "fmla v20.4s, v17.4s, v5.4s\n" + "mov v23.16b, v13.16b\n" + "ldr q10, [%[wbptr], #48]\n" + "str q24, [x25, x28]\n" + "mov v19.16b, v13.16b\n" + "mov v17.16b, v13.16b\n" + "ldr q21, [x20, x27]\n" + "fmla v20.4s, v14.4s, v7.4s\n" + "ldr q5, [%[wbptr], #128]\n" + "mov v14.16b, v13.16b\n" + "ldr x26, [%[outptrs], 120]\n" + "mov v0.16b, v13.16b\n" + "ldr x19, [%[inptrs], 104]\n" + "mov v1.16b, v13.16b\n" + "mov v2.16b, v13.16b\n" + "fmla v20.4s, v26.4s, v4.4s\n" + "ldr q7, [%[wbptr], #96]\n" + "fmla v18.4s, v29.4s, v12.4s\n" + "ldr q29, [x19, x27]\n" + "str q20, [x26, x28]\n" + "ldr q4, [%[wbptr], #144]\n" + "add x28, x28, #16\n" + "bne 2b\n" + "3:\n" + "mov v3.16b, v13.16b\n" + "ldr x18, [%[inptrs], 64]\n" + "fmla v18.4s, v27.4s, v9.4s\n" + "ldr x17, [%[inptrs], 24]\n" + "fmla v22.4s, v27.4s, v12.4s\n" + "ldr q30, [x18, x27]\n" + "fmla v23.4s, v28.4s, v12.4s\n" + "ldr x21, [%[inptrs], 192]\n" + "fmla v19.4s, v25.4s, v12.4s\n" + "ldr x20, [%[inptrs], 152]\n" + "fmla v18.4s, v28.4s, v11.4s\n" + "ldr q24, [x17, x27]\n" + "fmla v22.4s, v25.4s, v9.4s\n" + "ldr x19, [%[inptrs], 112]\n" + "fmla v23.4s, v16.4s, v9.4s\n" + "ldr x18, [%[inptrs], 72]\n" + "fmla v17.4s, v16.4s, v12.4s\n" + "ldr x17, [%[inptrs], 32]\n" + "fmla v18.4s, v25.4s, v6.4s\n" + "ldr q31, [x21, x27]\n" + "fmla v22.4s, v16.4s, v11.4s\n" + "ldr x22, [%[inptrs], 240]\n" + "fmla v23.4s, v15.4s, v11.4s\n" + "ldr x21, [%[inptrs], 200]\n" + "fmla v14.4s, v15.4s, v12.4s\n" + "ldr x23, [%[outptrs], 0]\n" + "fmla v18.4s, v16.4s, v8.4s\n" + "ldr q25, [x20, x27]\n" + "fmla v22.4s, v21.4s, v6.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v19.4s, v21.4s, v9.4s\n" + "ldr x24, [%[outptrs], 32]\n" + "fmla v0.4s, v21.4s, v12.4s\n" + "ldr q21, [x19, x27]\n" + "fmla v18.4s, v15.4s, v10.4s\n" + "ldr q20, [x18, x27]\n" + "fmla v22.4s, v29.4s, v8.4s\n" + "ldr x19, [%[inptrs], 120]\n" + "fmla v23.4s, v29.4s, v6.4s\n" + "ldr x18, [%[inptrs], 80]\n" + "fmla v19.4s, v29.4s, v11.4s\n" + "ldr x25, [%[outptrs], 64]\n" + "fmla v18.4s, v29.4s, v5.4s\n" + "ldr x26, [%[outptrs], 96]\n" + "fmla v17.4s, v29.4s, v9.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v1.4s, v29.4s, v12.4s\n" + "ldr q26, [x17, x27]\n" + "fmla v22.4s, v30.4s, v10.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v18.4s, v30.4s, v7.4s\n" + "ldr x17, [%[inptrs], 40]\n" + "fmla v23.4s, v30.4s, v8.4s\n" + "fmla v17.4s, v30.4s, v11.4s\n" + "fmla v14.4s, v30.4s, v9.4s\n" + "fmla v2.4s, v30.4s, v12.4s\n" + "mov v16.16b, v13.16b\n" + "fmla v3.4s, v24.4s, v12.4s\n" + "fmla v19.4s, v31.4s, v6.4s\n" + "fmla v0.4s, v31.4s, v9.4s\n" + "mov v15.16b, v13.16b\n" + "fmla v23.4s, v24.4s, v10.4s\n" + "fmla v14.4s, v24.4s, v11.4s\n" + "ldr q27, [x22, x27]\n" + "fmla v22.4s, v25.4s, v5.4s\n" + "ldr x22, [%[inptrs], 248]\n" + "fmla v19.4s, v25.4s, v8.4s\n" + "fmla v17.4s, v25.4s, v6.4s\n" + "fmla v0.4s, v25.4s, v11.4s\n" + "fmla v1.4s, v25.4s, v9.4s\n" + "fmla v16.4s, v25.4s, v12.4s\n" + "ldr q30, [x21, x27]\n" + "fmla v18.4s, v21.4s, v4.4s\n" + "ldr x21, [%[inptrs], 208]\n" + "fmla v22.4s, v21.4s, v7.4s\n" + "fmla v23.4s, v21.4s, v5.4s\n" + "fmla v19.4s, v21.4s, v10.4s\n" + "fmla v17.4s, v21.4s, v8.4s\n" + "fmla v14.4s, v21.4s, v6.4s\n" + "fmla v1.4s, v21.4s, v11.4s\n" + "str q18, [x23, x28]\n" + "mov v18.16b, v13.16b\n" + "fmla v2.4s, v21.4s, v9.4s\n" + "ldr x23, [%[outptrs], 8]\n" + "fmla v15.4s, v21.4s, v12.4s\n" + "ldr q24, [x20, x27]\n" + "fmla v23.4s, v20.4s, v7.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v17.4s, v20.4s, v10.4s\n" + "fmla v14.4s, v20.4s, v8.4s\n" + "fmla v2.4s, v20.4s, v11.4s\n" + "fmla v3.4s, v20.4s, v9.4s\n" + "fmla v18.4s, v20.4s, v12.4s\n" + "ldr q25, [x19, x27]\n" + "fmla v0.4s, v27.4s, v6.4s\n" + "ldr q29, [x18, x27]\n" + "fmla v14.4s, v26.4s, v10.4s\n" + "ldr x19, [%[inptrs], 128]\n" + "fmla v3.4s, v26.4s, v11.4s\n" + "ldr q27, [x17, x27]\n" + "fmla v19.4s, v30.4s, v5.4s\n" + "ldr x18, [%[inptrs], 88]\n" + "fmla v0.4s, v30.4s, v8.4s\n" + "fmla v1.4s, v30.4s, v6.4s\n" + "fmla v16.4s, v30.4s, v9.4s\n" + "ldr q28, [x22, x27]\n" + "fmla v22.4s, v24.4s, v4.4s\n" + "ldr x22, [%[inptrs], 256]\n" + "fmla v19.4s, v24.4s, v7.4s\n" + "fmla v17.4s, v24.4s, v5.4s\n" + "fmla v0.4s, v24.4s, v10.4s\n" + "fmla v1.4s, v24.4s, v8.4s\n" + "fmla v2.4s, v24.4s, v6.4s\n" + "fmla v16.4s, v24.4s, v11.4s\n" + "str q22, [x24, x28]\n" + "mov v21.16b, v13.16b\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "ldr x24, [%[outptrs], 40]\n" + "fmla v23.4s, v25.4s, v4.4s\n" + "fmla v17.4s, v25.4s, v7.4s\n" + "fmla v21.4s, v24.4s, v12.4s\n" + "ldr q22, [x21, x27]\n" + "fmla v14.4s, v25.4s, v5.4s\n" + "ldr x21, [%[inptrs], 216]\n" + "fmla v1.4s, v25.4s, v10.4s\n" + "fmla v2.4s, v25.4s, v8.4s\n" + "str q23, [x23, x28]\n" + "mov v24.16b, v13.16b\n" + "mov v20.16b, v13.16b\n" + "ldr x23, [%[outptrs], 16]\n" + "fmla v3.4s, v25.4s, v6.4s\n" + "fmla v15.4s, v25.4s, v11.4s\n" + "fmla v18.4s, v25.4s, v9.4s\n" + "fmla v24.4s, v25.4s, v12.4s\n" + "fmla v14.4s, v29.4s, v7.4s\n" + "ldr q30, [x20, x27]\n" + "fmla v2.4s, v29.4s, v10.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmla v3.4s, v29.4s, v8.4s\n" + "fmla v0.4s, v28.4s, v5.4s\n" + "fmla v18.4s, v29.4s, v11.4s\n" + "ldr q31, [x19, x27]\n" + "fmla v16.4s, v28.4s, v6.4s\n" + "ldr q26, [x18, x27]\n" + "fmla v19.4s, v22.4s, v4.4s\n" + "ldr x19, [%[inptrs], 136]\n" + "fmla v3.4s, v27.4s, v10.4s\n" + "ldr q23, [x22, x27]\n" + "fmla v0.4s, v22.4s, v7.4s\n" + "ldr x22, [%[inptrs], 264]\n" + "fmla v1.4s, v22.4s, v5.4s\n" + "fmla v16.4s, v22.4s, v8.4s\n" + "str q19, [x25, x28]\n" + "fmla v15.4s, v22.4s, v6.4s\n" + "fmla v21.4s, v22.4s, v9.4s\n" + "ldr q27, [x21, x27]\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "ldr q28, [x20, x27]\n" + "fmla v1.4s, v30.4s, v7.4s\n" + "ldr x21, [%[inptrs], 224]\n" + "fmla v2.4s, v30.4s, v5.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v16.4s, v30.4s, v10.4s\n" + "ldr x25, [%[outptrs], 72]\n" + "str q17, [x24, x28]\n" + "fmla v15.4s, v30.4s, v8.4s\n" + "fmla v18.4s, v30.4s, v6.4s\n" + "ldr q22, [x19, x27]\n" + "fmla v21.4s, v30.4s, v11.4s\n" + "ldr x24, [%[outptrs], 48]\n" + "fmla v24.4s, v30.4s, v9.4s\n" + "fmla v20.4s, v30.4s, v12.4s\n" + "fmla v14.4s, v31.4s, v4.4s\n" + "ldr q30, [x22, x27]\n" + "fmla v2.4s, v31.4s, v7.4s\n" + "ldr q19, [x21, x27]\n" + "fmla v3.4s, v31.4s, v5.4s\n" + "ldr x22, [%[inptrs], 272]\n" + "fmla v15.4s, v31.4s, v10.4s\n" + "ldr x21, [%[inptrs], 232]\n" + "str q14, [x23, x28]\n" + "fmla v18.4s, v31.4s, v8.4s\n" + "fmla v24.4s, v31.4s, v11.4s\n" + "ldr q31, [x20, x27]\n" + "fmla v3.4s, v26.4s, v7.4s\n" + "ldr q17, [x22, x27]\n" + "fmla v0.4s, v23.4s, v4.4s\n" + "ldr x22, [%[inptrs], 280]\n" + "fmla v18.4s, v26.4s, v10.4s\n" + "ldr q14, [x21, x27]\n" + "fmla v16.4s, v23.4s, v5.4s\n" + "ldr x23, [%[outptrs], 24]\n" + "fmla v21.4s, v23.4s, v6.4s\n" + "ldr q26, [x22, x27]\n" + "str q0, [x26, x28]\n" + "fmla v1.4s, v27.4s, v4.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "ldr x26, [%[outptrs], 104]\n" + "fmla v16.4s, v27.4s, v7.4s\n" + "add x27, x27, #16\n" + "fmla v21.4s, v27.4s, v8.4s\n" + "fmla v24.4s, v27.4s, v6.4s\n" + "str q1, [x25, x28]\n" + "fmla v20.4s, v27.4s, v9.4s\n" + "fmla v2.4s, v28.4s, v4.4s\n" + "ldr x25, [%[outptrs], 80]\n" + "fmla v15.4s, v28.4s, v7.4s\n" + "fmla v18.4s, v28.4s, v5.4s\n" + "fmla v21.4s, v28.4s, v10.4s\n" + "fmla v24.4s, v28.4s, v8.4s\n" + "fmla v20.4s, v28.4s, v11.4s\n" + "fmla v3.4s, v22.4s, v4.4s\n" + "str q2, [x24, x28]\n" + "fmla v16.4s, v30.4s, v4.4s\n" + "fmla v18.4s, v22.4s, v7.4s\n" + "ldr x24, [%[outptrs], 56]\n" + "fmla v24.4s, v22.4s, v10.4s\n" + "fmla v21.4s, v30.4s, v5.4s\n" + "str q3, [x23, x28]\n" + "fmla v20.4s, v30.4s, v6.4s\n" + "str q16, [x26, x28]\n" + "fmla v15.4s, v19.4s, v4.4s\n" + "fmla v18.4s, v31.4s, v4.4s\n" + "ldr x26, [%[outptrs], 112]\n" + "fmla v21.4s, v19.4s, v7.4s\n" + "fmla v24.4s, v19.4s, v5.4s\n" + "fmla v20.4s, v19.4s, v8.4s\n" + "str q15, [x25, x28]\n" + "str q18, [x24, x28]\n" + "ldr x25, [%[outptrs], 88]\n" + "fmla v24.4s, v31.4s, v7.4s\n" + "fmla v21.4s, v17.4s, v4.4s\n" + "fmla v20.4s, v31.4s, v10.4s\n" + "str q21, [x26, x28]\n" + "fmla v20.4s, v17.4s, v5.4s\n" + "ldr x26, [%[outptrs], 120]\n" + "fmla v24.4s, v14.4s, v4.4s\n" + "fmla v20.4s, v14.4s, v7.4s\n" + "str q24, [x25, x28]\n" + "fmla v20.4s, v26.4s, v4.4s\n" + "str q20, [x26, x28]\n" + "add x28, x28, #16\n" + "4:\n" + "cbz x15, 7f\n" + "ldr s13, [%[wbptr]]\n" + "mov v18.16b, v13.16b\n" + "ldr s12, [%[wbptr], #4]\n" + "mov v22.16b, v13.16b\n" + "ldr s11, [%[wbptr], #8]\n" + "mov v23.16b, v13.16b\n" + "ldr s10, [%[wbptr], #12]\n" + "mov v19.16b, v13.16b\n" + "ldr s9, [%[wbptr], #16]\n" + "mov v17.16b, v13.16b\n" + "ldr s8, [%[wbptr], #20]\n" + "mov v14.16b, v13.16b\n" + "ldr s7, [%[wbptr], #24]\n" + "mov v0.16b, v13.16b\n" + "ldr s6, [%[wbptr], #28]\n" + "mov v1.16b, v13.16b\n" + "ldr s5, [%[wbptr], #32]\n" + "mov v2.16b, v13.16b\n" + "ldr s4, [%[wbptr], #36]\n" + "ldr x17, [%[inptrs], 0]\n" + "ldr x18, [%[inptrs], 48]\n" + "ldr x19, [%[inptrs], 96]\n" + "ldr x20, [%[inptrs], 144]\n" + "subs x15, x15, #1\n" + "ldr s29, [x17, x27]\n" + "fmla v18.4s, v29.4s, v12.4s\n" + "ldr s27, [x18, x27]\n" + "ldr s25, [x19, x27]\n" + "ldr x17, [%[inptrs], 8]\n" + "ldr s21, [x20, x27]\n" + "ldr x18, [%[inptrs], 56]\n" + "ldr s28, [x17, x27]\n" + "ldr x19, [%[inptrs], 104]\n" + "ldr s16, [x18, x27]\n" + "ldr x17, [%[inptrs], 16]\n" + "ldr s29, [x19, x27]\n" + "ldr s15, [x17, x27]\n" + "beq 6f\n" + "5:\n" + "mov v3.16b, v13.16b\n" + "ldr x18, [%[inptrs], 64]\n" + "fmla v18.4s, v27.4s, v9.4s\n" + "ldr x17, [%[inptrs], 24]\n" + "fmla v22.4s, v27.4s, v12.4s\n" + "ldr s30, [x18, x27]\n" + "fmla v23.4s, v28.4s, v12.4s\n" + "ldr x21, [%[inptrs], 192]\n" + "fmla v19.4s, v25.4s, v12.4s\n" + "ldr x20, [%[inptrs], 152]\n" + "fmla v18.4s, v28.4s, v11.4s\n" + "ldr s24, [x17, x27]\n" + "fmla v22.4s, v25.4s, v9.4s\n" + "ldr x19, [%[inptrs], 112]\n" + "fmla v23.4s, v16.4s, v9.4s\n" + "ldr x18, [%[inptrs], 72]\n" + "fmla v17.4s, v16.4s, v12.4s\n" + "ldr x17, [%[inptrs], 32]\n" + "fmla v18.4s, v25.4s, v6.4s\n" + "ldr s31, [x21, x27]\n" + "fmla v22.4s, v16.4s, v11.4s\n" + "ldr x22, [%[inptrs], 240]\n" + "fmla v23.4s, v15.4s, v11.4s\n" + "ldr x21, [%[inptrs], 200]\n" + "fmla v14.4s, v15.4s, v12.4s\n" + "ldr x23, [%[outptrs], 0]\n" + "fmla v18.4s, v16.4s, v8.4s\n" + "ldr s25, [x20, x27]\n" + "fmla v22.4s, v21.4s, v6.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v19.4s, v21.4s, v9.4s\n" + "ldr x24, [%[outptrs], 32]\n" + "fmla v0.4s, v21.4s, v12.4s\n" + "ldr s21, [x19, x27]\n" + "fmla v18.4s, v15.4s, v10.4s\n" + "ldr s20, [x18, x27]\n" + "fmla v22.4s, v29.4s, v8.4s\n" + "ldr x19, [%[inptrs], 120]\n" + "fmla v23.4s, v29.4s, v6.4s\n" + "ldr x18, [%[inptrs], 80]\n" + "fmla v19.4s, v29.4s, v11.4s\n" + "ldr x25, [%[outptrs], 64]\n" + "fmla v18.4s, v29.4s, v5.4s\n" + "ldr x26, [%[outptrs], 96]\n" + "fmla v17.4s, v29.4s, v9.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v1.4s, v29.4s, v12.4s\n" + "ldr s26, [x17, x27]\n" + "fmla v22.4s, v30.4s, v10.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v18.4s, v30.4s, v7.4s\n" + "ldr x17, [%[inptrs], 40]\n" + "fmla v23.4s, v30.4s, v8.4s\n" + "subs x15, x15, #1\n" + "fmla v17.4s, v30.4s, v11.4s\n" + "fmla v14.4s, v30.4s, v9.4s\n" + "fmla v2.4s, v30.4s, v12.4s\n" + "ldr s27, [x22, x27]\n" + "fmla v3.4s, v24.4s, v12.4s\n" + "ldr x22, [%[inptrs], 248]\n" + "fmla v23.4s, v24.4s, v10.4s\n" + "fmla v19.4s, v31.4s, v6.4s\n" + "fmla v14.4s, v24.4s, v11.4s\n" + "ldr s30, [x21, x27]\n" + "fmla v0.4s, v31.4s, v9.4s\n" + "ldr s24, [x20, x27]\n" + "fmla v22.4s, v25.4s, v5.4s\n" + "ldr x21, [%[inptrs], 208]\n" + "fmla v19.4s, v25.4s, v8.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v17.4s, v25.4s, v6.4s\n" + "fmla v1.4s, v25.4s, v9.4s\n" + "fmla v0.4s, v25.4s, v11.4s\n" + "fmla v18.4s, v21.4s, v4.4s\n" + "fmla v22.4s, v21.4s, v7.4s\n" + "fmla v23.4s, v21.4s, v5.4s\n" + "fmla v19.4s, v21.4s, v10.4s\n" + "fmla v14.4s, v21.4s, v6.4s\n" + "fmla v17.4s, v21.4s, v8.4s\n" + "fmla v1.4s, v21.4s, v11.4s\n" + "str s18, [x23, x28]\n" + "mov v16.16b, v13.16b\n" + "fmla v2.4s, v21.4s, v9.4s\n" + "ldr x23, [%[outptrs], 8]\n" + "fmla v23.4s, v20.4s, v7.4s\n" + "fmla v14.4s, v20.4s, v8.4s\n" + "fmla v16.4s, v25.4s, v12.4s\n" + "ldr s25, [x19, x27]\n" + "fmla v17.4s, v20.4s, v10.4s\n" + "ldr x19, [%[inptrs], 128]\n" + "fmla v2.4s, v20.4s, v11.4s\n" + "fmla v3.4s, v20.4s, v9.4s\n" + "fmla v14.4s, v26.4s, v10.4s\n" + "fmla v0.4s, v27.4s, v6.4s\n" + "mov v15.16b, v13.16b\n" + "fmla v19.4s, v30.4s, v5.4s\n" + "fmla v1.4s, v30.4s, v6.4s\n" + "fmla v16.4s, v30.4s, v9.4s\n" + "fmla v3.4s, v26.4s, v11.4s\n" + "ldr s29, [x18, x27]\n" + "fmla v15.4s, v21.4s, v12.4s\n" + "ldr s27, [x17, x27]\n" + "fmla v0.4s, v30.4s, v8.4s\n" + "ldr s28, [x22, x27]\n" + "fmla v22.4s, v24.4s, v4.4s\n" + "ldr x18, [%[inptrs], 88]\n" + "fmla v19.4s, v24.4s, v7.4s\n" + "ldr x22, [%[inptrs], 256]\n" + "fmla v17.4s, v24.4s, v5.4s\n" + "ldr x17, [%[inptrs], 0]\n" + "fmla v0.4s, v24.4s, v10.4s\n" + "fmla v1.4s, v24.4s, v8.4s\n" + "str s22, [x24, x28]\n" + "mov v18.16b, v13.16b\n" + "fmla v2.4s, v24.4s, v6.4s\n" + "ldr x24, [%[outptrs], 40]\n" + "fmla v16.4s, v24.4s, v11.4s\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "fmla v18.4s, v20.4s, v12.4s\n" + "ldr s22, [x21, x27]\n" + "fmla v23.4s, v25.4s, v4.4s\n" + "ldr x21, [%[inptrs], 216]\n" + "fmla v17.4s, v25.4s, v7.4s\n" + "fmla v14.4s, v25.4s, v5.4s\n" + "fmla v1.4s, v25.4s, v10.4s\n" + "fmla v2.4s, v25.4s, v8.4s\n" + "fmla v3.4s, v25.4s, v6.4s\n" + "fmla v15.4s, v25.4s, v11.4s\n" + "str s23, [x23, x28]\n" + "mov v21.16b, v13.16b\n" + "fmla v18.4s, v25.4s, v9.4s\n" + "ldr x23, [%[outptrs], 16]\n" + "fmla v14.4s, v29.4s, v7.4s\n" + "fmla v2.4s, v29.4s, v10.4s\n" + "fmla v21.4s, v24.4s, v12.4s\n" + "ldr s30, [x20, x27]\n" + "fmla v3.4s, v29.4s, v8.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmla v18.4s, v29.4s, v11.4s\n" + "ldr s31, [x19, x27]\n" + "fmla v0.4s, v28.4s, v5.4s\n" + "ldr x19, [%[inptrs], 136]\n" + "fmla v16.4s, v28.4s, v6.4s\n" + "ldr s26, [x18, x27]\n" + "fmla v3.4s, v27.4s, v10.4s\n" + "ldr s23, [x22, x27]\n" + "fmla v19.4s, v22.4s, v4.4s\n" + "ldr x22, [%[inptrs], 264]\n" + "fmla v0.4s, v22.4s, v7.4s\n" + "ldr x18, [%[inptrs], 48]\n" + "fmla v1.4s, v22.4s, v5.4s\n" + "fmla v16.4s, v22.4s, v8.4s\n" + "fmla v15.4s, v22.4s, v6.4s\n" + "fmla v21.4s, v22.4s, v9.4s\n" + "str s19, [x25, x28]\n" + "mov v24.16b, v13.16b\n" + "mov v20.16b, v13.16b\n" + "ldr s27, [x21, x27]\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "ldr x21, [%[inptrs], 224]\n" + "fmla v24.4s, v25.4s, v12.4s\n" + "ldr s28, [x20, x27]\n" + "fmla v1.4s, v30.4s, v7.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v2.4s, v30.4s, v5.4s\n" + "ldr x25, [%[outptrs], 72]\n" + "str s17, [x24, x28]\n" + "fmla v16.4s, v30.4s, v10.4s\n" + "fmla v15.4s, v30.4s, v8.4s\n" + "ldr s22, [x19, x27]\n" + "fmla v18.4s, v30.4s, v6.4s\n" + "ldr x24, [%[outptrs], 48]\n" + "fmla v21.4s, v30.4s, v11.4s\n" + "ldr x19, [%[inptrs], 96]\n" + "fmla v24.4s, v30.4s, v9.4s\n" + "fmla v20.4s, v30.4s, v12.4s\n" + "fmla v14.4s, v31.4s, v4.4s\n" + "ldr s30, [x22, x27]\n" + "fmla v2.4s, v31.4s, v7.4s\n" + "ldr s19, [x21, x27]\n" + "fmla v3.4s, v31.4s, v5.4s\n" + "ldr x22, [%[inptrs], 272]\n" + "fmla v15.4s, v31.4s, v10.4s\n" + "ldr x21, [%[inptrs], 232]\n" + "str s14, [x23, x28]\n" + "fmla v18.4s, v31.4s, v8.4s\n" + "fmla v24.4s, v31.4s, v11.4s\n" + "ldr s31, [x20, x27]\n" + "fmla v3.4s, v26.4s, v7.4s\n" + "ldr s17, [x22, x27]\n" + "fmla v0.4s, v23.4s, v4.4s\n" + "ldr x22, [%[inptrs], 280]\n" + "fmla v18.4s, v26.4s, v10.4s\n" + "ldr s14, [x21, x27]\n" + "fmla v16.4s, v23.4s, v5.4s\n" + "ldr x23, [%[outptrs], 24]\n" + "fmla v21.4s, v23.4s, v6.4s\n" + "ldr s26, [x22, x27]\n" + "str s0, [x26, x28]\n" + "fmla v1.4s, v27.4s, v4.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "ldr s13, [%[wbptr]]\n" + "fmla v16.4s, v27.4s, v7.4s\n" + "ldr x26, [%[outptrs], 104]\n" + "fmla v21.4s, v27.4s, v8.4s\n" + "add x27, x27, #4\n" + "str s1, [x25, x28]\n" + "fmla v24.4s, v27.4s, v6.4s\n" + "fmla v20.4s, v27.4s, v9.4s\n" + "ldr s12, [%[wbptr], #4]\n" + "fmla v2.4s, v28.4s, v4.4s\n" + "ldr s29, [x17, x27]\n" + "fmla v15.4s, v28.4s, v7.4s\n" + "ldr s27, [x18, x27]\n" + "fmla v18.4s, v28.4s, v5.4s\n" + "ldr x25, [%[outptrs], 80]\n" + "fmla v21.4s, v28.4s, v10.4s\n" + "ldr x17, [%[inptrs], 8]\n" + "str s2, [x24, x28]\n" + "fmla v24.4s, v28.4s, v8.4s\n" + "fmla v20.4s, v28.4s, v11.4s\n" + "ldr s9, [%[wbptr], #16]\n" + "fmla v3.4s, v22.4s, v4.4s\n" + "ldr s28, [x17, x27]\n" + "fmla v18.4s, v22.4s, v7.4s\n" + "ldr s25, [x19, x27]\n" + "fmla v24.4s, v22.4s, v10.4s\n" + "ldr x24, [%[outptrs], 56]\n" + "fmla v16.4s, v30.4s, v4.4s\n" + "ldr s11, [%[wbptr], #8]\n" + "str s3, [x23, x28]\n" + "fmla v21.4s, v30.4s, v5.4s\n" + "fmla v20.4s, v30.4s, v6.4s\n" + "ldr x18, [%[inptrs], 56]\n" + "fmla v15.4s, v19.4s, v4.4s\n" + "ldr x17, [%[inptrs], 16]\n" + "str s16, [x26, x28]\n" + "fmla v24.4s, v19.4s, v5.4s\n" + "fmla v21.4s, v19.4s, v7.4s\n" + "ldr s16, [x18, x27]\n" + "fmla v20.4s, v19.4s, v8.4s\n" + "ldr s6, [%[wbptr], #28]\n" + "str s15, [x25, x28]\n" + "fmla v18.4s, v31.4s, v4.4s\n" + "fmla v24.4s, v31.4s, v7.4s\n" + "ldr s15, [x17, x27]\n" + "fmla v21.4s, v17.4s, v4.4s\n" + "ldr x25, [%[outptrs], 88]\n" + "fmla v20.4s, v31.4s, v10.4s\n" + "ldr s8, [%[wbptr], #20]\n" + "str s18, [x24, x28]\n" + "mov v18.16b, v13.16b\n" + "fmla v24.4s, v14.4s, v4.4s\n" + "ldr x26, [%[outptrs], 112]\n" + "mov v22.16b, v13.16b\n" + "ldr x20, [%[inptrs], 144]\n" + "str s21, [x26, x28]\n" + "fmla v20.4s, v17.4s, v5.4s\n" + "mov v23.16b, v13.16b\n" + "ldr s10, [%[wbptr], #12]\n" + "str s24, [x25, x28]\n" + "mov v19.16b, v13.16b\n" + "mov v17.16b, v13.16b\n" + "ldr s21, [x20, x27]\n" + "fmla v20.4s, v14.4s, v7.4s\n" + "ldr s5, [%[wbptr], #32]\n" + "mov v14.16b, v13.16b\n" + "ldr x26, [%[outptrs], 120]\n" + "mov v0.16b, v13.16b\n" + "ldr x19, [%[inptrs], 104]\n" + "mov v1.16b, v13.16b\n" + "mov v2.16b, v13.16b\n" + "fmla v20.4s, v26.4s, v4.4s\n" + "ldr s7, [%[wbptr], #24]\n" + "fmla v18.4s, v29.4s, v12.4s\n" + "ldr s29, [x19, x27]\n" + "str s20, [x26, x28]\n" + "ldr s4, [%[wbptr], #36]\n" + "add x28, x28, #4\n" + "bne 5b\n" + "6:\n" + "mov v3.16b, v13.16b\n" + "ldr x18, [%[inptrs], 64]\n" + "fmla v18.4s, v27.4s, v9.4s\n" + "ldr x17, [%[inptrs], 24]\n" + "fmla v22.4s, v27.4s, v12.4s\n" + "ldr s30, [x18, x27]\n" + "fmla v23.4s, v28.4s, v12.4s\n" + "ldr x21, [%[inptrs], 192]\n" + "fmla v19.4s, v25.4s, v12.4s\n" + "ldr x20, [%[inptrs], 152]\n" + "fmla v18.4s, v28.4s, v11.4s\n" + "ldr s24, [x17, x27]\n" + "fmla v22.4s, v25.4s, v9.4s\n" + "ldr x19, [%[inptrs], 112]\n" + "fmla v23.4s, v16.4s, v9.4s\n" + "ldr x18, [%[inptrs], 72]\n" + "fmla v17.4s, v16.4s, v12.4s\n" + "ldr x17, [%[inptrs], 32]\n" + "fmla v18.4s, v25.4s, v6.4s\n" + "ldr s31, [x21, x27]\n" + "fmla v22.4s, v16.4s, v11.4s\n" + "ldr x22, [%[inptrs], 240]\n" + "fmla v23.4s, v15.4s, v11.4s\n" + "ldr x21, [%[inptrs], 200]\n" + "fmla v14.4s, v15.4s, v12.4s\n" + "ldr x23, [%[outptrs], 0]\n" + "fmla v18.4s, v16.4s, v8.4s\n" + "ldr s25, [x20, x27]\n" + "fmla v22.4s, v21.4s, v6.4s\n" + "ldr x20, [%[inptrs], 160]\n" + "fmla v19.4s, v21.4s, v9.4s\n" + "ldr x24, [%[outptrs], 32]\n" + "fmla v0.4s, v21.4s, v12.4s\n" + "ldr s21, [x19, x27]\n" + "fmla v18.4s, v15.4s, v10.4s\n" + "ldr s20, [x18, x27]\n" + "fmla v22.4s, v29.4s, v8.4s\n" + "ldr x19, [%[inptrs], 120]\n" + "fmla v23.4s, v29.4s, v6.4s\n" + "ldr x18, [%[inptrs], 80]\n" + "fmla v19.4s, v29.4s, v11.4s\n" + "ldr x25, [%[outptrs], 64]\n" + "fmla v18.4s, v29.4s, v5.4s\n" + "ldr x26, [%[outptrs], 96]\n" + "fmla v17.4s, v29.4s, v9.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v1.4s, v29.4s, v12.4s\n" + "ldr s26, [x17, x27]\n" + "fmla v22.4s, v30.4s, v10.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v18.4s, v30.4s, v7.4s\n" + "ldr x17, [%[inptrs], 40]\n" + "fmla v23.4s, v30.4s, v8.4s\n" + "fmla v17.4s, v30.4s, v11.4s\n" + "fmla v14.4s, v30.4s, v9.4s\n" + "fmla v2.4s, v30.4s, v12.4s\n" + "mov v16.16b, v13.16b\n" + "fmla v3.4s, v24.4s, v12.4s\n" + "fmla v19.4s, v31.4s, v6.4s\n" + "fmla v0.4s, v31.4s, v9.4s\n" + "mov v15.16b, v13.16b\n" + "fmla v23.4s, v24.4s, v10.4s\n" + "fmla v14.4s, v24.4s, v11.4s\n" + "ldr s27, [x22, x27]\n" + "fmla v22.4s, v25.4s, v5.4s\n" + "ldr x22, [%[inptrs], 248]\n" + "fmla v19.4s, v25.4s, v8.4s\n" + "fmla v17.4s, v25.4s, v6.4s\n" + "fmla v0.4s, v25.4s, v11.4s\n" + "fmla v1.4s, v25.4s, v9.4s\n" + "fmla v16.4s, v25.4s, v12.4s\n" + "ldr s30, [x21, x27]\n" + "fmla v18.4s, v21.4s, v4.4s\n" + "ldr x21, [%[inptrs], 208]\n" + "fmla v22.4s, v21.4s, v7.4s\n" + "fmla v23.4s, v21.4s, v5.4s\n" + "fmla v19.4s, v21.4s, v10.4s\n" + "fmla v17.4s, v21.4s, v8.4s\n" + "fmla v14.4s, v21.4s, v6.4s\n" + "fmla v1.4s, v21.4s, v11.4s\n" + "str s18, [x23, x28]\n" + "mov v18.16b, v13.16b\n" + "fmla v2.4s, v21.4s, v9.4s\n" + "ldr x23, [%[outptrs], 8]\n" + "fmla v15.4s, v21.4s, v12.4s\n" + "ldr s24, [x20, x27]\n" + "fmla v23.4s, v20.4s, v7.4s\n" + "ldr x20, [%[inptrs], 168]\n" + "fmla v17.4s, v20.4s, v10.4s\n" + "fmla v14.4s, v20.4s, v8.4s\n" + "fmla v2.4s, v20.4s, v11.4s\n" + "fmla v3.4s, v20.4s, v9.4s\n" + "fmla v18.4s, v20.4s, v12.4s\n" + "ldr s25, [x19, x27]\n" + "fmla v0.4s, v27.4s, v6.4s\n" + "ldr s29, [x18, x27]\n" + "fmla v14.4s, v26.4s, v10.4s\n" + "ldr x19, [%[inptrs], 128]\n" + "fmla v3.4s, v26.4s, v11.4s\n" + "ldr s27, [x17, x27]\n" + "fmla v19.4s, v30.4s, v5.4s\n" + "ldr x18, [%[inptrs], 88]\n" + "fmla v0.4s, v30.4s, v8.4s\n" + "fmla v1.4s, v30.4s, v6.4s\n" + "fmla v16.4s, v30.4s, v9.4s\n" + "ldr s28, [x22, x27]\n" + "fmla v22.4s, v24.4s, v4.4s\n" + "ldr x22, [%[inptrs], 256]\n" + "fmla v19.4s, v24.4s, v7.4s\n" + "fmla v17.4s, v24.4s, v5.4s\n" + "fmla v0.4s, v24.4s, v10.4s\n" + "fmla v1.4s, v24.4s, v8.4s\n" + "fmla v2.4s, v24.4s, v6.4s\n" + "fmla v16.4s, v24.4s, v11.4s\n" + "str s22, [x24, x28]\n" + "mov v21.16b, v13.16b\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "ldr x24, [%[outptrs], 40]\n" + "fmla v23.4s, v25.4s, v4.4s\n" + "fmla v17.4s, v25.4s, v7.4s\n" + "fmla v21.4s, v24.4s, v12.4s\n" + "ldr s22, [x21, x27]\n" + "fmla v14.4s, v25.4s, v5.4s\n" + "ldr x21, [%[inptrs], 216]\n" + "fmla v1.4s, v25.4s, v10.4s\n" + "fmla v2.4s, v25.4s, v8.4s\n" + "str s23, [x23, x28]\n" + "mov v24.16b, v13.16b\n" + "mov v20.16b, v13.16b\n" + "ldr x23, [%[outptrs], 16]\n" + "fmla v3.4s, v25.4s, v6.4s\n" + "fmla v15.4s, v25.4s, v11.4s\n" + "fmla v18.4s, v25.4s, v9.4s\n" + "fmla v24.4s, v25.4s, v12.4s\n" + "fmla v14.4s, v29.4s, v7.4s\n" + "ldr s30, [x20, x27]\n" + "fmla v2.4s, v29.4s, v10.4s\n" + "ldr x20, [%[inptrs], 176]\n" + "fmla v3.4s, v29.4s, v8.4s\n" + "fmla v0.4s, v28.4s, v5.4s\n" + "fmla v18.4s, v29.4s, v11.4s\n" + "ldr s31, [x19, x27]\n" + "fmla v16.4s, v28.4s, v6.4s\n" + "ldr s26, [x18, x27]\n" + "fmla v19.4s, v22.4s, v4.4s\n" + "ldr x19, [%[inptrs], 136]\n" + "fmla v3.4s, v27.4s, v10.4s\n" + "ldr s23, [x22, x27]\n" + "fmla v0.4s, v22.4s, v7.4s\n" + "ldr x22, [%[inptrs], 264]\n" + "fmla v1.4s, v22.4s, v5.4s\n" + "fmla v16.4s, v22.4s, v8.4s\n" + "str s19, [x25, x28]\n" + "fmla v15.4s, v22.4s, v6.4s\n" + "fmla v21.4s, v22.4s, v9.4s\n" + "ldr s27, [x21, x27]\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "ldr s28, [x20, x27]\n" + "fmla v1.4s, v30.4s, v7.4s\n" + "ldr x21, [%[inptrs], 224]\n" + "fmla v2.4s, v30.4s, v5.4s\n" + "ldr x20, [%[inptrs], 184]\n" + "fmla v16.4s, v30.4s, v10.4s\n" + "ldr x25, [%[outptrs], 72]\n" + "str s17, [x24, x28]\n" + "fmla v15.4s, v30.4s, v8.4s\n" + "fmla v18.4s, v30.4s, v6.4s\n" + "ldr s22, [x19, x27]\n" + "fmla v21.4s, v30.4s, v11.4s\n" + "ldr x24, [%[outptrs], 48]\n" + "fmla v24.4s, v30.4s, v9.4s\n" + "fmla v20.4s, v30.4s, v12.4s\n" + "fmla v14.4s, v31.4s, v4.4s\n" + "ldr s30, [x22, x27]\n" + "fmla v2.4s, v31.4s, v7.4s\n" + "ldr s19, [x21, x27]\n" + "fmla v3.4s, v31.4s, v5.4s\n" + "ldr x22, [%[inptrs], 272]\n" + "fmla v15.4s, v31.4s, v10.4s\n" + "ldr x21, [%[inptrs], 232]\n" + "str s14, [x23, x28]\n" + "fmla v18.4s, v31.4s, v8.4s\n" + "fmla v24.4s, v31.4s, v11.4s\n" + "ldr s31, [x20, x27]\n" + "fmla v3.4s, v26.4s, v7.4s\n" + "ldr s17, [x22, x27]\n" + "fmla v0.4s, v23.4s, v4.4s\n" + "ldr x22, [%[inptrs], 280]\n" + "fmla v18.4s, v26.4s, v10.4s\n" + "ldr s14, [x21, x27]\n" + "fmla v16.4s, v23.4s, v5.4s\n" + "ldr x23, [%[outptrs], 24]\n" + "fmla v21.4s, v23.4s, v6.4s\n" + "ldr s26, [x22, x27]\n" + "str s0, [x26, x28]\n" + "fmla v1.4s, v27.4s, v4.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "ldr x26, [%[outptrs], 104]\n" + "fmla v16.4s, v27.4s, v7.4s\n" + "add x27, x27, #4\n" + "fmla v21.4s, v27.4s, v8.4s\n" + "fmla v24.4s, v27.4s, v6.4s\n" + "str s1, [x25, x28]\n" + "fmla v20.4s, v27.4s, v9.4s\n" + "fmla v2.4s, v28.4s, v4.4s\n" + "ldr x25, [%[outptrs], 80]\n" + "fmla v15.4s, v28.4s, v7.4s\n" + "fmla v18.4s, v28.4s, v5.4s\n" + "fmla v21.4s, v28.4s, v10.4s\n" + "fmla v24.4s, v28.4s, v8.4s\n" + "fmla v20.4s, v28.4s, v11.4s\n" + "fmla v3.4s, v22.4s, v4.4s\n" + "str s2, [x24, x28]\n" + "fmla v16.4s, v30.4s, v4.4s\n" + "fmla v18.4s, v22.4s, v7.4s\n" + "ldr x24, [%[outptrs], 56]\n" + "fmla v24.4s, v22.4s, v10.4s\n" + "fmla v21.4s, v30.4s, v5.4s\n" + "str s3, [x23, x28]\n" + "fmla v20.4s, v30.4s, v6.4s\n" + "str s16, [x26, x28]\n" + "fmla v15.4s, v19.4s, v4.4s\n" + "fmla v18.4s, v31.4s, v4.4s\n" + "ldr x26, [%[outptrs], 112]\n" + "fmla v21.4s, v19.4s, v7.4s\n" + "fmla v24.4s, v19.4s, v5.4s\n" + "fmla v20.4s, v19.4s, v8.4s\n" + "str s15, [x25, x28]\n" + "str s18, [x24, x28]\n" + "ldr x25, [%[outptrs], 88]\n" + "fmla v24.4s, v31.4s, v7.4s\n" + "fmla v21.4s, v17.4s, v4.4s\n" + "fmla v20.4s, v31.4s, v10.4s\n" + "str s21, [x26, x28]\n" + "fmla v20.4s, v17.4s, v5.4s\n" + "ldr x26, [%[outptrs], 120]\n" + "fmla v24.4s, v14.4s, v4.4s\n" + "fmla v20.4s, v14.4s, v7.4s\n" + "str s24, [x25, x28]\n" + "fmla v20.4s, v26.4s, v4.4s\n" + "str s20, [x26, x28]\n" + "add x28, x28, #4\n" + "7:\n" + : [wbptr] "+r" (weight_bias_ptr) + : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + ); +} + template <> template <> void Conv::execute_tile( @@ -2372,6 +3521,1223 @@ void Conv::execute_tile( ); } +template <> +template <> +void Conv::execute_tile( + int n_channels, + const void *weight_bias_ptr, + const float *inptrs[6][6], + float *outptrs[4][4] +) +{ + __asm __volatile( + "mov x27, xzr\n" + "mov x28, xzr\n" + "and x19, %[n_channels], #3\n" + "lsr x26, %[n_channels], #2\n" + "cbz x26, 4f\n" + "1:\n" + "ldr q25, [%[wbptr]]\n" + "ldr x25, [%[inptrs], 0]\n" + "mov v2.16b, v25.16b\n" + "ldr q22, [%[wbptr], #16]\n" + "mov v16.16b, v25.16b\n" + "ldr q9, [%[wbptr], #32]\n" + "mov v18.16b, v25.16b\n" + "ldr q8, [%[wbptr], #48]\n" + "mov v13.16b, v25.16b\n" + "ldr q19, [%[wbptr], #64]\n" + "mov v0.16b, v25.16b\n" + "ldr q7, [%[wbptr], #80]\n" + "mov v17.16b, v25.16b\n" + "ldr q6, [%[wbptr], #96]\n" + "mov v14.16b, v25.16b\n" + "ldr q5, [%[wbptr], #112]\n" + "mov v12.16b, v25.16b\n" + "ldr q4, [%[wbptr], #128]\n" + "mov v15.16b, v25.16b\n" + "ldr q3, [%[wbptr], #144]\n" + "ldr q27, [x25, x27]\n" + "ldr x17, [%[inptrs], 48]\n" + "fmla v2.4s, v27.4s, v22.4s\n" + "ldr x25, [%[inptrs], 8]\n" + "ldr q26, [x17, x27]\n" + "ldr x24, [%[inptrs], 96]\n" + "fmla v16.4s, v26.4s, v22.4s\n" + "ldr q31, [x25, x27]\n" + "ldr q28, [x24, x27]\n" + "ldr x17, [%[inptrs], 56]\n" + "fmla v2.4s, v26.4s, v19.4s\n" + "ldr x25, [%[inptrs], 16]\n" + "ldr q29, [x17, x27]\n" + "ldr x18, [%[inptrs], 144]\n" + "ldr x24, [%[inptrs], 104]\n" + "subs x26, x26, #1\n" + "ldr q30, [x25, x27]\n" + "ldr q27, [x18, x27]\n" + "ldr q21, [x24, x27]\n" + "fmla v2.4s, v31.4s, v9.4s\n" + "beq 3f\n" + "2:\n" + "mov v1.16b, v25.16b\n" + "ldr x17, [%[inptrs], 64]\n" + "mov v10.16b, v25.16b\n" + "ldr x25, [%[inptrs], 24]\n" + "fmla v18.4s, v31.4s, v22.4s\n" + "ldr q23, [x17, x27]\n" + "fmla v2.4s, v28.4s, v5.4s\n" + "ldr x15, [%[inptrs], 192]\n" + "fmla v16.4s, v28.4s, v19.4s\n" + "ldr x18, [%[inptrs], 152]\n" + "fmla v13.4s, v28.4s, v22.4s\n" + "ldr q26, [x25, x27]\n" + "fmla v18.4s, v29.4s, v19.4s\n" + "ldr x24, [%[inptrs], 112]\n" + "fmla v2.4s, v29.4s, v7.4s\n" + "ldr x17, [%[inptrs], 72]\n" + "fmla v16.4s, v29.4s, v9.4s\n" + "ldr x25, [%[inptrs], 32]\n" + "fmla v0.4s, v29.4s, v22.4s\n" + "ldr q28, [x15, x27]\n" + "fmla v18.4s, v30.4s, v9.4s\n" + "ldr x16, [%[inptrs], 240]\n" + "fmla v2.4s, v30.4s, v8.4s\n" + "ldr x15, [%[inptrs], 200]\n" + "fmla v17.4s, v30.4s, v22.4s\n" + "ldr q29, [x18, x27]\n" + "fmla v16.4s, v27.4s, v5.4s\n" + "ldr x18, [%[inptrs], 160]\n" + "fmla v13.4s, v27.4s, v19.4s\n" + "ldr x20, [%[outptrs], 0]\n" + "fmla v14.4s, v27.4s, v22.4s\n" + "ldr q20, [x24, x27]\n" + "fmla v2.4s, v21.4s, v4.4s\n" + "ldr x24, [%[inptrs], 120]\n" + "fmla v16.4s, v21.4s, v7.4s\n" + "ldr x21, [%[outptrs], 32]\n" + "fmla v18.4s, v21.4s, v5.4s\n" + "ldr x22, [%[outptrs], 64]\n" + "fmla v13.4s, v21.4s, v9.4s\n" + "ldr x23, [%[outptrs], 96]\n" + "fmla v0.4s, v21.4s, v19.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v12.4s, v21.4s, v22.4s\n" + "ldr q24, [x17, x27]\n" + "fmla v2.4s, v23.4s, v6.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v16.4s, v23.4s, v8.4s\n" + "ldr x17, [%[inptrs], 80]\n" + "fmla v18.4s, v23.4s, v7.4s\n" + "subs x26, x26, #1\n" + "fmla v0.4s, v23.4s, v9.4s\n" + "fmla v17.4s, v23.4s, v19.4s\n" + "fmla v15.4s, v23.4s, v22.4s\n" + "ldr q23, [x25, x27]\n" + "fmla v1.4s, v26.4s, v22.4s\n" + "ldr x25, [%[inptrs], 40]\n" + "fmla v18.4s, v26.4s, v8.4s\n" + "fmla v13.4s, v28.4s, v5.4s\n" + "fmla v17.4s, v26.4s, v9.4s\n" + "ldr q30, [x16, x27]\n" + "fmla v14.4s, v28.4s, v19.4s\n" + "ldr q26, [x15, x27]\n" + "fmla v16.4s, v29.4s, v4.4s\n" + "ldr x16, [%[inptrs], 248]\n" + "fmla v13.4s, v29.4s, v7.4s\n" + "ldr x15, [%[inptrs], 208]\n" + "fmla v0.4s, v29.4s, v5.4s\n" + "fmla v12.4s, v29.4s, v19.4s\n" + "fmla v14.4s, v29.4s, v9.4s\n" + "fmla v10.4s, v29.4s, v22.4s\n" + "mov v11.16b, v25.16b\n" + "fmla v2.4s, v20.4s, v3.4s\n" + "fmla v16.4s, v20.4s, v6.4s\n" + "fmla v18.4s, v20.4s, v4.4s\n" + "fmla v13.4s, v20.4s, v8.4s\n" + "fmla v0.4s, v20.4s, v7.4s\n" + "fmla v17.4s, v20.4s, v5.4s\n" + "fmla v12.4s, v20.4s, v9.4s\n" + "fmla v15.4s, v20.4s, v19.4s\n" + "fmla v11.4s, v20.4s, v22.4s\n" + "mov v21.16b, v25.16b\n" + "fmla v18.4s, v24.4s, v6.4s\n" + "fmla v0.4s, v24.4s, v8.4s\n" + "fmla v1.4s, v24.4s, v19.4s\n" + "fmla v17.4s, v24.4s, v7.4s\n" + "fmla v14.4s, v30.4s, v5.4s\n" + "mov v20.16b, v25.16b\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "fmla v21.4s, v24.4s, v22.4s\n" + "ldr q27, [x18, x27]\n" + "fmla v1.4s, v23.4s, v9.4s\n" + "ldr x18, [%[inptrs], 168]\n" + "fmla v17.4s, v23.4s, v8.4s\n" + "ldr q30, [x24, x27]\n" + "fmla v13.4s, v26.4s, v4.4s\n" + "ldr x24, [%[inptrs], 128]\n" + "fmla v14.4s, v26.4s, v7.4s\n" + "fmla v12.4s, v26.4s, v5.4s\n" + "fmla v10.4s, v26.4s, v19.4s\n" + "ldr q31, [x17, x27]\n" + "fmla v16.4s, v27.4s, v3.4s\n" + "ldr x17, [%[inptrs], 88]\n" + "fmla v13.4s, v27.4s, v6.4s\n" + "fmla v0.4s, v27.4s, v4.4s\n" + "fmla v14.4s, v27.4s, v8.4s\n" + "fmla v12.4s, v27.4s, v7.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "fmla v10.4s, v27.4s, v9.4s\n" + "fmla v11.4s, v27.4s, v19.4s\n" + "fmla v20.4s, v27.4s, v22.4s\n" + "mov v24.16b, v25.16b\n" + "mov v23.16b, v25.16b\n" + "fmla v18.4s, v30.4s, v3.4s\n" + "fmla v0.4s, v30.4s, v6.4s\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "fmla v12.4s, v30.4s, v8.4s\n" + "fmla v15.4s, v30.4s, v7.4s\n" + "fmla v1.4s, v30.4s, v5.4s\n" + "fmla v11.4s, v30.4s, v9.4s\n" + "fmla v21.4s, v30.4s, v19.4s\n" + "fmla v24.4s, v30.4s, v22.4s\n" + "ldr q25, [x25, x27]\n" + "fmla v17.4s, v31.4s, v6.4s\n" + "ldr x25, [%[inptrs], 0]\n" + "fmla v15.4s, v31.4s, v8.4s\n" + "fmla v1.4s, v31.4s, v7.4s\n" + "fmla v21.4s, v31.4s, v9.4s\n" + "ldr q26, [x16, x27]\n" + "fmla v14.4s, v26.4s, v4.4s\n" + "ldr x16, [%[inptrs], 256]\n" + "fmla v10.4s, v26.4s, v5.4s\n" + "ldr q31, [x15, x27]\n" + "fmla v1.4s, v25.4s, v8.4s\n" + "ldr q29, [x18, x27]\n" + "fmla v13.4s, v31.4s, v3.4s\n" + "ldr x15, [%[inptrs], 216]\n" + "fmla v14.4s, v31.4s, v6.4s\n" + "ldr x18, [%[inptrs], 176]\n" + "fmla v12.4s, v31.4s, v4.4s\n" + "fmla v10.4s, v31.4s, v7.4s\n" + "fmla v11.4s, v31.4s, v5.4s\n" + "fmla v20.4s, v31.4s, v19.4s\n" + "fmla v0.4s, v29.4s, v3.4s\n" + "ldr q28, [x24, x27]\n" + "fmla v15.4s, v29.4s, v4.4s\n" + "ldr x24, [%[inptrs], 136]\n" + "fmla v12.4s, v29.4s, v6.4s\n" + "fmla v10.4s, v29.4s, v8.4s\n" + "fmla v11.4s, v29.4s, v7.4s\n" + "fmla v21.4s, v29.4s, v5.4s\n" + "fmla v20.4s, v29.4s, v9.4s\n" + "fmla v24.4s, v29.4s, v19.4s\n" + "fmla v23.4s, v29.4s, v22.4s\n" + "ldr q25, [x17, x27]\n" + "fmla v17.4s, v28.4s, v3.4s\n" + "ldr q29, [x16, x27]\n" + "fmla v15.4s, v28.4s, v6.4s\n" + "ldr x16, [%[inptrs], 264]\n" + "fmla v1.4s, v28.4s, v4.4s\n" + "ldr x17, [%[inptrs], 48]\n" + "fmla v11.4s, v28.4s, v8.4s\n" + "fmla v21.4s, v28.4s, v7.4s\n" + "fmla v24.4s, v28.4s, v9.4s\n" + "ldr q22, [x15, x27]\n" + "fmla v14.4s, v29.4s, v3.4s\n" + "ldr x15, [%[inptrs], 224]\n" + "fmla v1.4s, v25.4s, v6.4s\n" + "fmla v10.4s, v29.4s, v4.4s\n" + "fmla v21.4s, v25.4s, v8.4s\n" + "ldr q27, [x18, x27]\n" + "fmla v20.4s, v29.4s, v5.4s\n" + "ldr q26, [x24, x27]\n" + "fmla v12.4s, v22.4s, v3.4s\n" + "ldr x18, [%[inptrs], 184]\n" + "fmla v10.4s, v22.4s, v6.4s\n" + "ldr x24, [%[inptrs], 96]\n" + "fmla v11.4s, v22.4s, v4.4s\n" + "fmla v24.4s, v22.4s, v5.4s\n" + "fmla v20.4s, v22.4s, v7.4s\n" + "fmla v23.4s, v22.4s, v19.4s\n" + "fmla v15.4s, v27.4s, v3.4s\n" + "ldr q25, [x16, x27]\n" + "fmla v21.4s, v27.4s, v4.4s\n" + "ldr q31, [x15, x27]\n" + "fmla v11.4s, v27.4s, v6.4s\n" + "ldr x16, [%[inptrs], 272]\n" + "fmla v20.4s, v27.4s, v8.4s\n" + "ldr x15, [%[inptrs], 232]\n" + "fmla v24.4s, v27.4s, v7.4s\n" + "fmla v23.4s, v27.4s, v9.4s\n" + "fmla v1.4s, v26.4s, v3.4s\n" + "ldr q22, [x18, x27]\n" + "fmla v21.4s, v26.4s, v6.4s\n" + "ldr q19, [x16, x27]\n" + "fmla v10.4s, v25.4s, v3.4s\n" + "ldr x16, [%[inptrs], 280]\n" + "fmla v24.4s, v26.4s, v8.4s\n" + "ldr q28, [x15, x27]\n" + "fmla v20.4s, v25.4s, v4.4s\n" + "ldr x18, [%[inptrs], 144]\n" + "fmla v23.4s, v25.4s, v5.4s\n" + "ldr q30, [x16, x27]\n" + "fmla v11.4s, v31.4s, v3.4s\n" + "add x27, x27, #16\n" + "fmla v24.4s, v31.4s, v4.4s\n" + "ldr q27, [x25, x27]\n" + "fmla v20.4s, v31.4s, v6.4s\n" + "ldr x25, [%[inptrs], 8]\n" + "fmla v23.4s, v31.4s, v7.4s\n" + "movi v29.16b, #0\n" + "fmla v21.4s, v22.4s, v3.4s\n" + "ldr q26, [x17, x27]\n" + "fmla v24.4s, v22.4s, v6.4s\n" + "ldr x17, [%[inptrs], 56]\n" + "fmla v20.4s, v19.4s, v3.4s\n" + "fmax v2.4s, v2.4s, v29.4s\n" + "fmla v23.4s, v22.4s, v8.4s\n" + "ldr q25, [%[wbptr]]\n" + "fmax v18.4s, v18.4s, v29.4s\n" + "ldr q22, [%[wbptr], #16]\n" + "str q2, [x20, x28]\n" + "fmla v24.4s, v28.4s, v3.4s\n" + "fmax v17.4s, v17.4s, v29.4s\n" + "ldr q9, [%[wbptr], #32]\n" + "fmla v23.4s, v19.4s, v4.4s\n" + "ldr q8, [%[wbptr], #48]\n" + "fmax v1.4s, v1.4s, v29.4s\n" + "ldr q19, [%[wbptr], #64]\n" + "fmax v16.4s, v16.4s, v29.4s\n" + "ldr x20, [%[outptrs], 8]\n" + "fmax v0.4s, v0.4s, v29.4s\n" + "fmax v15.4s, v15.4s, v29.4s\n" + "str q18, [x20, x28]\n" + "fmla v23.4s, v28.4s, v6.4s\n" + "str q16, [x21, x28]\n" + "fmax v21.4s, v21.4s, v29.4s\n" + "fmax v13.4s, v13.4s, v29.4s\n" + "ldr q7, [%[wbptr], #80]\n" + "fmax v12.4s, v12.4s, v29.4s\n" + "ldr q5, [%[wbptr], #112]\n" + "fmla v23.4s, v30.4s, v3.4s\n" + "ldr q6, [%[wbptr], #96]\n" + "str q13, [x22, x28]\n" + "fmax v11.4s, v11.4s, v29.4s\n" + "fmax v24.4s, v24.4s, v29.4s\n" + "ldr q4, [%[wbptr], #128]\n" + "fmax v14.4s, v14.4s, v29.4s\n" + "ldr q31, [x25, x27]\n" + "fmax v10.4s, v10.4s, v29.4s\n" + "ldr q3, [%[wbptr], #144]\n" + "fmax v20.4s, v20.4s, v29.4s\n" + "ldr q28, [x24, x27]\n" + "str q14, [x23, x28]\n" + "fmax v23.4s, v23.4s, v29.4s\n" + "mov v2.16b, v25.16b\n" + "ldr q29, [x17, x27]\n" + "ldr x20, [%[outptrs], 16]\n" + "ldr x21, [%[outptrs], 40]\n" + "ldr x22, [%[outptrs], 72]\n" + "ldr x23, [%[outptrs], 104]\n" + "ldr x25, [%[inptrs], 16]\n" + "ldr x24, [%[inptrs], 104]\n" + "str q17, [x20, x28]\n" + "mov v16.16b, v25.16b\n" + "str q0, [x21, x28]\n" + "mov v18.16b, v25.16b\n" + "str q12, [x22, x28]\n" + "mov v13.16b, v25.16b\n" + "str q10, [x23, x28]\n" + "mov v0.16b, v25.16b\n" + "fmla v2.4s, v27.4s, v22.4s\n" + "ldr q30, [x25, x27]\n" + "fmla v16.4s, v26.4s, v22.4s\n" + "ldr x20, [%[outptrs], 24]\n" + "mov v17.16b, v25.16b\n" + "ldr x21, [%[outptrs], 48]\n" + "str q1, [x20, x28]\n" + "mov v14.16b, v25.16b\n" + "str q15, [x21, x28]\n" + "mov v12.16b, v25.16b\n" + "mov v15.16b, v25.16b\n" + "ldr x21, [%[outptrs], 56]\n" + "fmla v2.4s, v26.4s, v19.4s\n" + "ldr q27, [x18, x27]\n" + "str q21, [x21, x28]\n" + "ldr x22, [%[outptrs], 80]\n" + "ldr q21, [x24, x27]\n" + "ldr x23, [%[outptrs], 112]\n" + "str q11, [x22, x28]\n" + "fmla v2.4s, v31.4s, v9.4s\n" + "str q20, [x23, x28]\n" + "ldr x22, [%[outptrs], 88]\n" + "ldr x23, [%[outptrs], 120]\n" + "str q24, [x22, x28]\n" + "str q23, [x23, x28]\n" + "add x28, x28, #16\n" + "bne 2b\n" + "3:\n" + "mov v1.16b, v25.16b\n" + "ldr x17, [%[inptrs], 64]\n" + "mov v10.16b, v25.16b\n" + "ldr x25, [%[inptrs], 24]\n" + "mov v11.16b, v25.16b\n" + "ldr x15, [%[inptrs], 192]\n" + "fmla v18.4s, v31.4s, v22.4s\n" + "ldr q23, [x17, x27]\n" + "fmla v2.4s, v28.4s, v5.4s\n" + "ldr x18, [%[inptrs], 152]\n" + "fmla v16.4s, v28.4s, v19.4s\n" + "ldr x24, [%[inptrs], 112]\n" + "fmla v13.4s, v28.4s, v22.4s\n" + "ldr q26, [x25, x27]\n" + "fmla v18.4s, v29.4s, v19.4s\n" + "ldr x17, [%[inptrs], 72]\n" + "fmla v2.4s, v29.4s, v7.4s\n" + "ldr x25, [%[inptrs], 32]\n" + "fmla v16.4s, v29.4s, v9.4s\n" + "ldr x16, [%[inptrs], 240]\n" + "fmla v0.4s, v29.4s, v22.4s\n" + "ldr q28, [x15, x27]\n" + "fmla v18.4s, v30.4s, v9.4s\n" + "ldr x15, [%[inptrs], 200]\n" + "fmla v2.4s, v30.4s, v8.4s\n" + "ldr x20, [%[outptrs], 0]\n" + "fmla v17.4s, v30.4s, v22.4s\n" + "ldr q29, [x18, x27]\n" + "fmla v16.4s, v27.4s, v5.4s\n" + "ldr x18, [%[inptrs], 160]\n" + "fmla v13.4s, v27.4s, v19.4s\n" + "ldr x21, [%[outptrs], 32]\n" + "fmla v14.4s, v27.4s, v22.4s\n" + "ldr q20, [x24, x27]\n" + "fmla v2.4s, v21.4s, v4.4s\n" + "ldr x24, [%[inptrs], 120]\n" + "fmla v16.4s, v21.4s, v7.4s\n" + "ldr x22, [%[outptrs], 64]\n" + "fmla v18.4s, v21.4s, v5.4s\n" + "ldr x23, [%[outptrs], 96]\n" + "fmla v13.4s, v21.4s, v9.4s\n" + "add %[wbptr], %[wbptr], #160\n" + "fmla v0.4s, v21.4s, v19.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v12.4s, v21.4s, v22.4s\n" + "ldr q24, [x17, x27]\n" + "fmla v2.4s, v23.4s, v6.4s\n" + "ldr x17, [%[inptrs], 80]\n" + "fmla v16.4s, v23.4s, v8.4s\n" + "fmla v18.4s, v23.4s, v7.4s\n" + "fmla v0.4s, v23.4s, v9.4s\n" + "fmla v17.4s, v23.4s, v19.4s\n" + "fmla v15.4s, v23.4s, v22.4s\n" + "ldr q23, [x25, x27]\n" + "fmla v1.4s, v26.4s, v22.4s\n" + "ldr x25, [%[inptrs], 40]\n" + "fmla v18.4s, v26.4s, v8.4s\n" + "fmla v13.4s, v28.4s, v5.4s\n" + "fmla v17.4s, v26.4s, v9.4s\n" + "ldr q30, [x16, x27]\n" + "fmla v14.4s, v28.4s, v19.4s\n" + "ldr q26, [x15, x27]\n" + "fmla v16.4s, v29.4s, v4.4s\n" + "ldr x16, [%[inptrs], 248]\n" + "fmla v13.4s, v29.4s, v7.4s\n" + "ldr x15, [%[inptrs], 208]\n" + "fmla v0.4s, v29.4s, v5.4s\n" + "fmla v12.4s, v29.4s, v19.4s\n" + "fmla v14.4s, v29.4s, v9.4s\n" + "fmla v10.4s, v29.4s, v22.4s\n" + "mov v21.16b, v25.16b\n" + "fmla v2.4s, v20.4s, v3.4s\n" + "fmla v16.4s, v20.4s, v6.4s\n" + "fmla v18.4s, v20.4s, v4.4s\n" + "fmla v13.4s, v20.4s, v8.4s\n" + "fmla v0.4s, v20.4s, v7.4s\n" + "fmla v17.4s, v20.4s, v5.4s\n" + "fmla v12.4s, v20.4s, v9.4s\n" + "fmla v15.4s, v20.4s, v19.4s\n" + "fmla v11.4s, v20.4s, v22.4s\n" + "mov v20.16b, v25.16b\n" + "fmla v18.4s, v24.4s, v6.4s\n" + "fmla v0.4s, v24.4s, v8.4s\n" + "fmla v1.4s, v24.4s, v19.4s\n" + "fmla v17.4s, v24.4s, v7.4s\n" + "fmla v21.4s, v24.4s, v22.4s\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "ldr q27, [x18, x27]\n" + "fmla v14.4s, v30.4s, v5.4s\n" + "ldr q30, [x24, x27]\n" + "fmla v1.4s, v23.4s, v9.4s\n" + "ldr x18, [%[inptrs], 168]\n" + "fmla v17.4s, v23.4s, v8.4s\n" + "ldr q31, [x17, x27]\n" + "fmla v13.4s, v26.4s, v4.4s\n" + "ldr x24, [%[inptrs], 128]\n" + "fmla v14.4s, v26.4s, v7.4s\n" + "ldr x17, [%[inptrs], 88]\n" + "fmla v12.4s, v26.4s, v5.4s\n" + "fmla v10.4s, v26.4s, v19.4s\n" + "mov v24.16b, v25.16b\n" + "mov v23.16b, v25.16b\n" + "fmla v16.4s, v27.4s, v3.4s\n" + "fmla v13.4s, v27.4s, v6.4s\n" + "fmla v0.4s, v27.4s, v4.4s\n" + "fmla v14.4s, v27.4s, v8.4s\n" + "fmla v12.4s, v27.4s, v7.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "fmla v10.4s, v27.4s, v9.4s\n" + "fmla v11.4s, v27.4s, v19.4s\n" + "fmla v20.4s, v27.4s, v22.4s\n" + "ldr q25, [x25, x27]\n" + "fmla v18.4s, v30.4s, v3.4s\n" + "fmla v0.4s, v30.4s, v6.4s\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "fmla v12.4s, v30.4s, v8.4s\n" + "fmla v15.4s, v30.4s, v7.4s\n" + "fmla v1.4s, v30.4s, v5.4s\n" + "fmla v11.4s, v30.4s, v9.4s\n" + "fmla v21.4s, v30.4s, v19.4s\n" + "fmla v24.4s, v30.4s, v22.4s\n" + "ldr q26, [x16, x27]\n" + "fmla v17.4s, v31.4s, v6.4s\n" + "ldr x16, [%[inptrs], 256]\n" + "fmla v15.4s, v31.4s, v8.4s\n" + "fmla v1.4s, v31.4s, v7.4s\n" + "fmla v21.4s, v31.4s, v9.4s\n" + "ldr q31, [x15, x27]\n" + "fmla v14.4s, v26.4s, v4.4s\n" + "ldr x15, [%[inptrs], 216]\n" + "fmla v10.4s, v26.4s, v5.4s\n" + "ldr q29, [x18, x27]\n" + "fmla v1.4s, v25.4s, v8.4s\n" + "ldr q28, [x24, x27]\n" + "fmla v13.4s, v31.4s, v3.4s\n" + "ldr x18, [%[inptrs], 176]\n" + "fmla v14.4s, v31.4s, v6.4s\n" + "ldr x24, [%[inptrs], 136]\n" + "fmla v12.4s, v31.4s, v4.4s\n" + "fmla v10.4s, v31.4s, v7.4s\n" + "fmla v11.4s, v31.4s, v5.4s\n" + "fmla v20.4s, v31.4s, v19.4s\n" + "fmla v0.4s, v29.4s, v3.4s\n" + "ldr q25, [x17, x27]\n" + "fmla v15.4s, v29.4s, v4.4s\n" + "fmla v21.4s, v29.4s, v5.4s\n" + "fmla v12.4s, v29.4s, v6.4s\n" + "fmla v10.4s, v29.4s, v8.4s\n" + "fmla v11.4s, v29.4s, v7.4s\n" + "fmla v20.4s, v29.4s, v9.4s\n" + "fmla v24.4s, v29.4s, v19.4s\n" + "fmla v23.4s, v29.4s, v22.4s\n" + "fmla v17.4s, v28.4s, v3.4s\n" + "ldr q29, [x16, x27]\n" + "fmla v15.4s, v28.4s, v6.4s\n" + "ldr q22, [x15, x27]\n" + "fmla v1.4s, v28.4s, v4.4s\n" + "ldr x16, [%[inptrs], 264]\n" + "fmla v11.4s, v28.4s, v8.4s\n" + "ldr x15, [%[inptrs], 224]\n" + "fmla v21.4s, v28.4s, v7.4s\n" + "fmla v24.4s, v28.4s, v9.4s\n" + "fmla v14.4s, v29.4s, v3.4s\n" + "ldr q27, [x18, x27]\n" + "fmla v1.4s, v25.4s, v6.4s\n" + "ldr x18, [%[inptrs], 184]\n" + "fmla v10.4s, v29.4s, v4.4s\n" + "fmla v20.4s, v29.4s, v5.4s\n" + "fmla v21.4s, v25.4s, v8.4s\n" + "ldr q26, [x24, x27]\n" + "fmla v12.4s, v22.4s, v3.4s\n" + "ldr q25, [x16, x27]\n" + "fmla v11.4s, v22.4s, v4.4s\n" + "ldr x16, [%[inptrs], 272]\n" + "fmla v10.4s, v22.4s, v6.4s\n" + "fmla v20.4s, v22.4s, v7.4s\n" + "fmla v24.4s, v22.4s, v5.4s\n" + "fmla v23.4s, v22.4s, v19.4s\n" + "fmla v15.4s, v27.4s, v3.4s\n" + "ldr q31, [x15, x27]\n" + "fmla v11.4s, v27.4s, v6.4s\n" + "ldr q22, [x18, x27]\n" + "fmla v21.4s, v27.4s, v4.4s\n" + "ldr x15, [%[inptrs], 232]\n" + "fmla v20.4s, v27.4s, v8.4s\n" + "fmla v24.4s, v27.4s, v7.4s\n" + "fmla v23.4s, v27.4s, v9.4s\n" + "ldr q19, [x16, x27]\n" + "fmla v1.4s, v26.4s, v3.4s\n" + "ldr q28, [x15, x27]\n" + "fmla v21.4s, v26.4s, v6.4s\n" + "ldr x16, [%[inptrs], 280]\n" + "fmla v24.4s, v26.4s, v8.4s\n" + "fmla v10.4s, v25.4s, v3.4s\n" + "fmla v20.4s, v25.4s, v4.4s\n" + "ldr q30, [x16, x27]\n" + "fmla v23.4s, v25.4s, v5.4s\n" + "add x27, x27, #16\n" + "fmla v11.4s, v31.4s, v3.4s\n" + "fmla v21.4s, v22.4s, v3.4s\n" + "fmla v24.4s, v31.4s, v4.4s\n" + "movi v29.16b, #0\n" + "fmla v20.4s, v31.4s, v6.4s\n" + "fmla v23.4s, v31.4s, v7.4s\n" + "fmax v2.4s, v2.4s, v29.4s\n" + "fmax v18.4s, v18.4s, v29.4s\n" + "fmla v24.4s, v22.4s, v6.4s\n" + "fmax v17.4s, v17.4s, v29.4s\n" + "fmla v20.4s, v19.4s, v3.4s\n" + "fmax v1.4s, v1.4s, v29.4s\n" + "str q2, [x20, x28]\n" + "fmla v23.4s, v22.4s, v8.4s\n" + "fmax v16.4s, v16.4s, v29.4s\n" + "ldr x20, [%[outptrs], 8]\n" + "fmla v24.4s, v28.4s, v3.4s\n" + "fmax v0.4s, v0.4s, v29.4s\n" + "str q18, [x20, x28]\n" + "fmax v15.4s, v15.4s, v29.4s\n" + "str q16, [x21, x28]\n" + "fmla v23.4s, v19.4s, v4.4s\n" + "fmax v21.4s, v21.4s, v29.4s\n" + "ldr x20, [%[outptrs], 16]\n" + "fmax v13.4s, v13.4s, v29.4s\n" + "ldr x21, [%[outptrs], 40]\n" + "str q17, [x20, x28]\n" + "fmax v12.4s, v12.4s, v29.4s\n" + "str q0, [x21, x28]\n" + "fmla v23.4s, v28.4s, v6.4s\n" + "str q13, [x22, x28]\n" + "fmax v11.4s, v11.4s, v29.4s\n" + "fmax v24.4s, v24.4s, v29.4s\n" + "ldr x20, [%[outptrs], 24]\n" + "fmax v14.4s, v14.4s, v29.4s\n" + "ldr x21, [%[outptrs], 48]\n" + "str q1, [x20, x28]\n" + "fmla v23.4s, v30.4s, v3.4s\n" + "str q15, [x21, x28]\n" + "fmax v10.4s, v10.4s, v29.4s\n" + "str q14, [x23, x28]\n" + "fmax v20.4s, v20.4s, v29.4s\n" + "ldr x21, [%[outptrs], 56]\n" + "ldr x22, [%[outptrs], 72]\n" + "ldr x23, [%[outptrs], 104]\n" + "fmax v23.4s, v23.4s, v29.4s\n" + "str q21, [x21, x28]\n" + "str q12, [x22, x28]\n" + "str q10, [x23, x28]\n" + "ldr x22, [%[outptrs], 80]\n" + "ldr x23, [%[outptrs], 112]\n" + "str q11, [x22, x28]\n" + "str q20, [x23, x28]\n" + "ldr x22, [%[outptrs], 88]\n" + "ldr x23, [%[outptrs], 120]\n" + "str q24, [x22, x28]\n" + "str q23, [x23, x28]\n" + "add x28, x28, #16\n" + "4:\n" + "cbz x19, 7f\n" + "ldr s25, [%[wbptr]]\n" + "mov v2.16b, v25.16b\n" + "ldr s22, [%[wbptr], #4]\n" + "mov v16.16b, v25.16b\n" + "ldr s9, [%[wbptr], #8]\n" + "mov v18.16b, v25.16b\n" + "ldr s8, [%[wbptr], #12]\n" + "mov v13.16b, v25.16b\n" + "ldr s19, [%[wbptr], #16]\n" + "mov v0.16b, v25.16b\n" + "ldr s7, [%[wbptr], #20]\n" + "mov v17.16b, v25.16b\n" + "ldr s6, [%[wbptr], #24]\n" + "mov v14.16b, v25.16b\n" + "ldr s5, [%[wbptr], #28]\n" + "mov v12.16b, v25.16b\n" + "ldr s4, [%[wbptr], #32]\n" + "mov v15.16b, v25.16b\n" + "ldr s3, [%[wbptr], #36]\n" + "ldr x25, [%[inptrs], 0]\n" + "ldr x17, [%[inptrs], 48]\n" + "ldr x24, [%[inptrs], 96]\n" + "ldr x18, [%[inptrs], 144]\n" + "subs x19, x19, #1\n" + "ldr s27, [x25, x27]\n" + "fmla v2.4s, v27.4s, v22.4s\n" + "ldr s26, [x17, x27]\n" + "fmla v16.4s, v26.4s, v22.4s\n" + "ldr s28, [x24, x27]\n" + "ldr s27, [x18, x27]\n" + "ldr x25, [%[inptrs], 8]\n" + "ldr x17, [%[inptrs], 56]\n" + "ldr x24, [%[inptrs], 104]\n" + "ldr s31, [x25, x27]\n" + "fmla v2.4s, v26.4s, v19.4s\n" + "ldr s29, [x17, x27]\n" + "ldr s21, [x24, x27]\n" + "ldr x25, [%[inptrs], 16]\n" + "ldr s30, [x25, x27]\n" + "fmla v2.4s, v31.4s, v9.4s\n" + "beq 6f\n" + "5:\n" + "mov v1.16b, v25.16b\n" + "ldr x17, [%[inptrs], 64]\n" + "mov v10.16b, v25.16b\n" + "ldr x25, [%[inptrs], 24]\n" + "fmla v18.4s, v31.4s, v22.4s\n" + "ldr s23, [x17, x27]\n" + "fmla v2.4s, v28.4s, v5.4s\n" + "ldr x15, [%[inptrs], 192]\n" + "fmla v16.4s, v28.4s, v19.4s\n" + "ldr x18, [%[inptrs], 152]\n" + "fmla v13.4s, v28.4s, v22.4s\n" + "ldr s26, [x25, x27]\n" + "fmla v18.4s, v29.4s, v19.4s\n" + "ldr x24, [%[inptrs], 112]\n" + "fmla v2.4s, v29.4s, v7.4s\n" + "ldr x17, [%[inptrs], 72]\n" + "fmla v16.4s, v29.4s, v9.4s\n" + "ldr x25, [%[inptrs], 32]\n" + "fmla v0.4s, v29.4s, v22.4s\n" + "ldr s28, [x15, x27]\n" + "fmla v18.4s, v30.4s, v9.4s\n" + "ldr x16, [%[inptrs], 240]\n" + "fmla v2.4s, v30.4s, v8.4s\n" + "ldr x15, [%[inptrs], 200]\n" + "fmla v17.4s, v30.4s, v22.4s\n" + "ldr s29, [x18, x27]\n" + "fmla v16.4s, v27.4s, v5.4s\n" + "ldr x18, [%[inptrs], 160]\n" + "fmla v13.4s, v27.4s, v19.4s\n" + "ldr x20, [%[outptrs], 0]\n" + "fmla v14.4s, v27.4s, v22.4s\n" + "ldr s20, [x24, x27]\n" + "fmla v2.4s, v21.4s, v4.4s\n" + "ldr x24, [%[inptrs], 120]\n" + "fmla v16.4s, v21.4s, v7.4s\n" + "ldr x21, [%[outptrs], 32]\n" + "fmla v18.4s, v21.4s, v5.4s\n" + "ldr x22, [%[outptrs], 64]\n" + "fmla v13.4s, v21.4s, v9.4s\n" + "ldr x23, [%[outptrs], 96]\n" + "fmla v0.4s, v21.4s, v19.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v12.4s, v21.4s, v22.4s\n" + "ldr s24, [x17, x27]\n" + "fmla v2.4s, v23.4s, v6.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v16.4s, v23.4s, v8.4s\n" + "ldr x17, [%[inptrs], 80]\n" + "fmla v18.4s, v23.4s, v7.4s\n" + "subs x19, x19, #1\n" + "fmla v0.4s, v23.4s, v9.4s\n" + "fmla v17.4s, v23.4s, v19.4s\n" + "fmla v15.4s, v23.4s, v22.4s\n" + "ldr s23, [x25, x27]\n" + "fmla v1.4s, v26.4s, v22.4s\n" + "ldr x25, [%[inptrs], 40]\n" + "fmla v18.4s, v26.4s, v8.4s\n" + "fmla v13.4s, v28.4s, v5.4s\n" + "fmla v17.4s, v26.4s, v9.4s\n" + "ldr s30, [x16, x27]\n" + "fmla v14.4s, v28.4s, v19.4s\n" + "ldr s26, [x15, x27]\n" + "fmla v16.4s, v29.4s, v4.4s\n" + "ldr x16, [%[inptrs], 248]\n" + "fmla v13.4s, v29.4s, v7.4s\n" + "ldr x15, [%[inptrs], 208]\n" + "fmla v0.4s, v29.4s, v5.4s\n" + "fmla v12.4s, v29.4s, v19.4s\n" + "fmla v14.4s, v29.4s, v9.4s\n" + "fmla v10.4s, v29.4s, v22.4s\n" + "mov v11.16b, v25.16b\n" + "fmla v2.4s, v20.4s, v3.4s\n" + "fmla v16.4s, v20.4s, v6.4s\n" + "fmla v18.4s, v20.4s, v4.4s\n" + "fmla v13.4s, v20.4s, v8.4s\n" + "fmla v0.4s, v20.4s, v7.4s\n" + "fmla v17.4s, v20.4s, v5.4s\n" + "fmla v12.4s, v20.4s, v9.4s\n" + "fmla v15.4s, v20.4s, v19.4s\n" + "fmla v11.4s, v20.4s, v22.4s\n" + "mov v21.16b, v25.16b\n" + "fmla v18.4s, v24.4s, v6.4s\n" + "fmla v0.4s, v24.4s, v8.4s\n" + "fmla v1.4s, v24.4s, v19.4s\n" + "fmla v17.4s, v24.4s, v7.4s\n" + "fmla v14.4s, v30.4s, v5.4s\n" + "mov v20.16b, v25.16b\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "fmla v21.4s, v24.4s, v22.4s\n" + "ldr s27, [x18, x27]\n" + "fmla v1.4s, v23.4s, v9.4s\n" + "ldr x18, [%[inptrs], 168]\n" + "fmla v17.4s, v23.4s, v8.4s\n" + "ldr s30, [x24, x27]\n" + "fmla v13.4s, v26.4s, v4.4s\n" + "ldr x24, [%[inptrs], 128]\n" + "fmla v14.4s, v26.4s, v7.4s\n" + "fmla v12.4s, v26.4s, v5.4s\n" + "fmla v10.4s, v26.4s, v19.4s\n" + "ldr s31, [x17, x27]\n" + "fmla v16.4s, v27.4s, v3.4s\n" + "ldr x17, [%[inptrs], 88]\n" + "fmla v13.4s, v27.4s, v6.4s\n" + "fmla v0.4s, v27.4s, v4.4s\n" + "fmla v14.4s, v27.4s, v8.4s\n" + "fmla v12.4s, v27.4s, v7.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "fmla v10.4s, v27.4s, v9.4s\n" + "fmla v11.4s, v27.4s, v19.4s\n" + "fmla v20.4s, v27.4s, v22.4s\n" + "mov v24.16b, v25.16b\n" + "mov v23.16b, v25.16b\n" + "fmla v18.4s, v30.4s, v3.4s\n" + "fmla v0.4s, v30.4s, v6.4s\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "fmla v12.4s, v30.4s, v8.4s\n" + "fmla v15.4s, v30.4s, v7.4s\n" + "fmla v1.4s, v30.4s, v5.4s\n" + "fmla v11.4s, v30.4s, v9.4s\n" + "fmla v21.4s, v30.4s, v19.4s\n" + "fmla v24.4s, v30.4s, v22.4s\n" + "ldr s25, [x25, x27]\n" + "fmla v17.4s, v31.4s, v6.4s\n" + "ldr x25, [%[inptrs], 0]\n" + "fmla v15.4s, v31.4s, v8.4s\n" + "fmla v1.4s, v31.4s, v7.4s\n" + "fmla v21.4s, v31.4s, v9.4s\n" + "ldr s26, [x16, x27]\n" + "fmla v14.4s, v26.4s, v4.4s\n" + "ldr x16, [%[inptrs], 256]\n" + "fmla v10.4s, v26.4s, v5.4s\n" + "ldr s31, [x15, x27]\n" + "fmla v1.4s, v25.4s, v8.4s\n" + "ldr s29, [x18, x27]\n" + "fmla v13.4s, v31.4s, v3.4s\n" + "ldr x15, [%[inptrs], 216]\n" + "fmla v14.4s, v31.4s, v6.4s\n" + "ldr x18, [%[inptrs], 176]\n" + "fmla v12.4s, v31.4s, v4.4s\n" + "fmla v10.4s, v31.4s, v7.4s\n" + "fmla v11.4s, v31.4s, v5.4s\n" + "fmla v20.4s, v31.4s, v19.4s\n" + "fmla v0.4s, v29.4s, v3.4s\n" + "ldr s28, [x24, x27]\n" + "fmla v15.4s, v29.4s, v4.4s\n" + "ldr x24, [%[inptrs], 136]\n" + "fmla v12.4s, v29.4s, v6.4s\n" + "fmla v10.4s, v29.4s, v8.4s\n" + "fmla v11.4s, v29.4s, v7.4s\n" + "fmla v21.4s, v29.4s, v5.4s\n" + "fmla v20.4s, v29.4s, v9.4s\n" + "fmla v24.4s, v29.4s, v19.4s\n" + "fmla v23.4s, v29.4s, v22.4s\n" + "ldr s25, [x17, x27]\n" + "fmla v17.4s, v28.4s, v3.4s\n" + "ldr s29, [x16, x27]\n" + "fmla v15.4s, v28.4s, v6.4s\n" + "ldr x16, [%[inptrs], 264]\n" + "fmla v1.4s, v28.4s, v4.4s\n" + "ldr x17, [%[inptrs], 48]\n" + "fmla v11.4s, v28.4s, v8.4s\n" + "fmla v21.4s, v28.4s, v7.4s\n" + "fmla v24.4s, v28.4s, v9.4s\n" + "ldr s22, [x15, x27]\n" + "fmla v14.4s, v29.4s, v3.4s\n" + "ldr x15, [%[inptrs], 224]\n" + "fmla v1.4s, v25.4s, v6.4s\n" + "fmla v10.4s, v29.4s, v4.4s\n" + "fmla v21.4s, v25.4s, v8.4s\n" + "ldr s27, [x18, x27]\n" + "fmla v20.4s, v29.4s, v5.4s\n" + "ldr s26, [x24, x27]\n" + "fmla v12.4s, v22.4s, v3.4s\n" + "ldr x18, [%[inptrs], 184]\n" + "fmla v10.4s, v22.4s, v6.4s\n" + "ldr x24, [%[inptrs], 96]\n" + "fmla v11.4s, v22.4s, v4.4s\n" + "fmla v24.4s, v22.4s, v5.4s\n" + "fmla v20.4s, v22.4s, v7.4s\n" + "fmla v23.4s, v22.4s, v19.4s\n" + "fmla v15.4s, v27.4s, v3.4s\n" + "ldr s25, [x16, x27]\n" + "fmla v21.4s, v27.4s, v4.4s\n" + "ldr s31, [x15, x27]\n" + "fmla v11.4s, v27.4s, v6.4s\n" + "ldr x16, [%[inptrs], 272]\n" + "fmla v20.4s, v27.4s, v8.4s\n" + "ldr x15, [%[inptrs], 232]\n" + "fmla v24.4s, v27.4s, v7.4s\n" + "fmla v23.4s, v27.4s, v9.4s\n" + "fmla v1.4s, v26.4s, v3.4s\n" + "ldr s22, [x18, x27]\n" + "fmla v21.4s, v26.4s, v6.4s\n" + "ldr s19, [x16, x27]\n" + "fmla v10.4s, v25.4s, v3.4s\n" + "ldr x16, [%[inptrs], 280]\n" + "fmla v24.4s, v26.4s, v8.4s\n" + "ldr s28, [x15, x27]\n" + "fmla v20.4s, v25.4s, v4.4s\n" + "ldr x18, [%[inptrs], 144]\n" + "fmla v23.4s, v25.4s, v5.4s\n" + "ldr s30, [x16, x27]\n" + "fmla v11.4s, v31.4s, v3.4s\n" + "add x27, x27, #4\n" + "fmla v24.4s, v31.4s, v4.4s\n" + "ldr s27, [x25, x27]\n" + "fmla v20.4s, v31.4s, v6.4s\n" + "ldr x25, [%[inptrs], 8]\n" + "fmla v23.4s, v31.4s, v7.4s\n" + "movi v29.16b, #0\n" + "fmla v21.4s, v22.4s, v3.4s\n" + "ldr s26, [x17, x27]\n" + "fmla v24.4s, v22.4s, v6.4s\n" + "ldr x17, [%[inptrs], 56]\n" + "fmla v20.4s, v19.4s, v3.4s\n" + "fmax v2.4s, v2.4s, v29.4s\n" + "fmla v23.4s, v22.4s, v8.4s\n" + "ldr s25, [%[wbptr]]\n" + "fmax v18.4s, v18.4s, v29.4s\n" + "ldr s22, [%[wbptr], #4]\n" + "str s2, [x20, x28]\n" + "fmla v24.4s, v28.4s, v3.4s\n" + "fmax v17.4s, v17.4s, v29.4s\n" + "ldr s9, [%[wbptr], #8]\n" + "fmla v23.4s, v19.4s, v4.4s\n" + "ldr s8, [%[wbptr], #12]\n" + "fmax v1.4s, v1.4s, v29.4s\n" + "ldr s19, [%[wbptr], #16]\n" + "fmax v16.4s, v16.4s, v29.4s\n" + "ldr x20, [%[outptrs], 8]\n" + "fmax v0.4s, v0.4s, v29.4s\n" + "fmax v15.4s, v15.4s, v29.4s\n" + "str s18, [x20, x28]\n" + "fmla v23.4s, v28.4s, v6.4s\n" + "str s16, [x21, x28]\n" + "fmax v21.4s, v21.4s, v29.4s\n" + "fmax v13.4s, v13.4s, v29.4s\n" + "ldr s7, [%[wbptr], #20]\n" + "fmax v12.4s, v12.4s, v29.4s\n" + "ldr s5, [%[wbptr], #28]\n" + "fmla v23.4s, v30.4s, v3.4s\n" + "ldr s6, [%[wbptr], #24]\n" + "str s13, [x22, x28]\n" + "fmax v11.4s, v11.4s, v29.4s\n" + "fmax v24.4s, v24.4s, v29.4s\n" + "ldr s4, [%[wbptr], #32]\n" + "fmax v14.4s, v14.4s, v29.4s\n" + "ldr s31, [x25, x27]\n" + "fmax v10.4s, v10.4s, v29.4s\n" + "ldr s3, [%[wbptr], #36]\n" + "fmax v20.4s, v20.4s, v29.4s\n" + "ldr s28, [x24, x27]\n" + "str s14, [x23, x28]\n" + "fmax v23.4s, v23.4s, v29.4s\n" + "mov v2.16b, v25.16b\n" + "ldr s29, [x17, x27]\n" + "ldr x20, [%[outptrs], 16]\n" + "ldr x21, [%[outptrs], 40]\n" + "ldr x22, [%[outptrs], 72]\n" + "ldr x23, [%[outptrs], 104]\n" + "ldr x25, [%[inptrs], 16]\n" + "ldr x24, [%[inptrs], 104]\n" + "str s17, [x20, x28]\n" + "mov v16.16b, v25.16b\n" + "str s0, [x21, x28]\n" + "mov v18.16b, v25.16b\n" + "str s12, [x22, x28]\n" + "mov v13.16b, v25.16b\n" + "str s10, [x23, x28]\n" + "mov v0.16b, v25.16b\n" + "fmla v2.4s, v27.4s, v22.4s\n" + "ldr s30, [x25, x27]\n" + "fmla v16.4s, v26.4s, v22.4s\n" + "ldr x20, [%[outptrs], 24]\n" + "mov v17.16b, v25.16b\n" + "ldr x21, [%[outptrs], 48]\n" + "str s1, [x20, x28]\n" + "mov v14.16b, v25.16b\n" + "str s15, [x21, x28]\n" + "mov v12.16b, v25.16b\n" + "mov v15.16b, v25.16b\n" + "ldr x21, [%[outptrs], 56]\n" + "fmla v2.4s, v26.4s, v19.4s\n" + "ldr s27, [x18, x27]\n" + "str s21, [x21, x28]\n" + "ldr x22, [%[outptrs], 80]\n" + "ldr s21, [x24, x27]\n" + "ldr x23, [%[outptrs], 112]\n" + "str s11, [x22, x28]\n" + "fmla v2.4s, v31.4s, v9.4s\n" + "str s20, [x23, x28]\n" + "ldr x22, [%[outptrs], 88]\n" + "ldr x23, [%[outptrs], 120]\n" + "str s24, [x22, x28]\n" + "str s23, [x23, x28]\n" + "add x28, x28, #4\n" + "bne 5b\n" + "6:\n" + "mov v1.16b, v25.16b\n" + "ldr x17, [%[inptrs], 64]\n" + "mov v10.16b, v25.16b\n" + "ldr x25, [%[inptrs], 24]\n" + "mov v11.16b, v25.16b\n" + "ldr x15, [%[inptrs], 192]\n" + "fmla v18.4s, v31.4s, v22.4s\n" + "ldr s23, [x17, x27]\n" + "fmla v2.4s, v28.4s, v5.4s\n" + "ldr x18, [%[inptrs], 152]\n" + "fmla v16.4s, v28.4s, v19.4s\n" + "ldr x24, [%[inptrs], 112]\n" + "fmla v13.4s, v28.4s, v22.4s\n" + "ldr s26, [x25, x27]\n" + "fmla v18.4s, v29.4s, v19.4s\n" + "ldr x17, [%[inptrs], 72]\n" + "fmla v2.4s, v29.4s, v7.4s\n" + "ldr x25, [%[inptrs], 32]\n" + "fmla v16.4s, v29.4s, v9.4s\n" + "ldr x16, [%[inptrs], 240]\n" + "fmla v0.4s, v29.4s, v22.4s\n" + "ldr s28, [x15, x27]\n" + "fmla v18.4s, v30.4s, v9.4s\n" + "ldr x15, [%[inptrs], 200]\n" + "fmla v2.4s, v30.4s, v8.4s\n" + "ldr x20, [%[outptrs], 0]\n" + "fmla v17.4s, v30.4s, v22.4s\n" + "ldr s29, [x18, x27]\n" + "fmla v16.4s, v27.4s, v5.4s\n" + "ldr x18, [%[inptrs], 160]\n" + "fmla v13.4s, v27.4s, v19.4s\n" + "ldr x21, [%[outptrs], 32]\n" + "fmla v14.4s, v27.4s, v22.4s\n" + "ldr s20, [x24, x27]\n" + "fmla v2.4s, v21.4s, v4.4s\n" + "ldr x24, [%[inptrs], 120]\n" + "fmla v16.4s, v21.4s, v7.4s\n" + "ldr x22, [%[outptrs], 64]\n" + "fmla v18.4s, v21.4s, v5.4s\n" + "ldr x23, [%[outptrs], 96]\n" + "fmla v13.4s, v21.4s, v9.4s\n" + "add %[wbptr], %[wbptr], #40\n" + "fmla v0.4s, v21.4s, v19.4s\n" + "prfm pldl1keep, [%[wbptr], #64]\n" + "fmla v12.4s, v21.4s, v22.4s\n" + "ldr s24, [x17, x27]\n" + "fmla v2.4s, v23.4s, v6.4s\n" + "ldr x17, [%[inptrs], 80]\n" + "fmla v16.4s, v23.4s, v8.4s\n" + "fmla v18.4s, v23.4s, v7.4s\n" + "fmla v0.4s, v23.4s, v9.4s\n" + "fmla v17.4s, v23.4s, v19.4s\n" + "fmla v15.4s, v23.4s, v22.4s\n" + "ldr s23, [x25, x27]\n" + "fmla v1.4s, v26.4s, v22.4s\n" + "ldr x25, [%[inptrs], 40]\n" + "fmla v18.4s, v26.4s, v8.4s\n" + "fmla v13.4s, v28.4s, v5.4s\n" + "fmla v17.4s, v26.4s, v9.4s\n" + "ldr s30, [x16, x27]\n" + "fmla v14.4s, v28.4s, v19.4s\n" + "ldr s26, [x15, x27]\n" + "fmla v16.4s, v29.4s, v4.4s\n" + "ldr x16, [%[inptrs], 248]\n" + "fmla v13.4s, v29.4s, v7.4s\n" + "ldr x15, [%[inptrs], 208]\n" + "fmla v0.4s, v29.4s, v5.4s\n" + "fmla v12.4s, v29.4s, v19.4s\n" + "fmla v14.4s, v29.4s, v9.4s\n" + "fmla v10.4s, v29.4s, v22.4s\n" + "mov v21.16b, v25.16b\n" + "fmla v2.4s, v20.4s, v3.4s\n" + "fmla v16.4s, v20.4s, v6.4s\n" + "fmla v18.4s, v20.4s, v4.4s\n" + "fmla v13.4s, v20.4s, v8.4s\n" + "fmla v0.4s, v20.4s, v7.4s\n" + "fmla v17.4s, v20.4s, v5.4s\n" + "fmla v12.4s, v20.4s, v9.4s\n" + "fmla v15.4s, v20.4s, v19.4s\n" + "fmla v11.4s, v20.4s, v22.4s\n" + "mov v20.16b, v25.16b\n" + "fmla v18.4s, v24.4s, v6.4s\n" + "fmla v0.4s, v24.4s, v8.4s\n" + "fmla v1.4s, v24.4s, v19.4s\n" + "fmla v17.4s, v24.4s, v7.4s\n" + "fmla v21.4s, v24.4s, v22.4s\n" + "fmla v15.4s, v24.4s, v9.4s\n" + "ldr s27, [x18, x27]\n" + "fmla v14.4s, v30.4s, v5.4s\n" + "ldr s30, [x24, x27]\n" + "fmla v1.4s, v23.4s, v9.4s\n" + "ldr x18, [%[inptrs], 168]\n" + "fmla v17.4s, v23.4s, v8.4s\n" + "ldr s31, [x17, x27]\n" + "fmla v13.4s, v26.4s, v4.4s\n" + "ldr x24, [%[inptrs], 128]\n" + "fmla v14.4s, v26.4s, v7.4s\n" + "ldr x17, [%[inptrs], 88]\n" + "fmla v12.4s, v26.4s, v5.4s\n" + "fmla v10.4s, v26.4s, v19.4s\n" + "mov v24.16b, v25.16b\n" + "mov v23.16b, v25.16b\n" + "fmla v16.4s, v27.4s, v3.4s\n" + "fmla v13.4s, v27.4s, v6.4s\n" + "fmla v0.4s, v27.4s, v4.4s\n" + "fmla v14.4s, v27.4s, v8.4s\n" + "fmla v12.4s, v27.4s, v7.4s\n" + "fmla v15.4s, v27.4s, v5.4s\n" + "fmla v10.4s, v27.4s, v9.4s\n" + "fmla v11.4s, v27.4s, v19.4s\n" + "fmla v20.4s, v27.4s, v22.4s\n" + "ldr s25, [x25, x27]\n" + "fmla v18.4s, v30.4s, v3.4s\n" + "fmla v0.4s, v30.4s, v6.4s\n" + "fmla v17.4s, v30.4s, v4.4s\n" + "fmla v12.4s, v30.4s, v8.4s\n" + "fmla v15.4s, v30.4s, v7.4s\n" + "fmla v1.4s, v30.4s, v5.4s\n" + "fmla v11.4s, v30.4s, v9.4s\n" + "fmla v21.4s, v30.4s, v19.4s\n" + "fmla v24.4s, v30.4s, v22.4s\n" + "ldr s26, [x16, x27]\n" + "fmla v17.4s, v31.4s, v6.4s\n" + "ldr x16, [%[inptrs], 256]\n" + "fmla v15.4s, v31.4s, v8.4s\n" + "fmla v1.4s, v31.4s, v7.4s\n" + "fmla v21.4s, v31.4s, v9.4s\n" + "ldr s31, [x15, x27]\n" + "fmla v14.4s, v26.4s, v4.4s\n" + "ldr x15, [%[inptrs], 216]\n" + "fmla v10.4s, v26.4s, v5.4s\n" + "ldr s29, [x18, x27]\n" + "fmla v1.4s, v25.4s, v8.4s\n" + "ldr s28, [x24, x27]\n" + "fmla v13.4s, v31.4s, v3.4s\n" + "ldr x18, [%[inptrs], 176]\n" + "fmla v14.4s, v31.4s, v6.4s\n" + "ldr x24, [%[inptrs], 136]\n" + "fmla v12.4s, v31.4s, v4.4s\n" + "fmla v10.4s, v31.4s, v7.4s\n" + "fmla v11.4s, v31.4s, v5.4s\n" + "fmla v20.4s, v31.4s, v19.4s\n" + "fmla v0.4s, v29.4s, v3.4s\n" + "ldr s25, [x17, x27]\n" + "fmla v15.4s, v29.4s, v4.4s\n" + "fmla v21.4s, v29.4s, v5.4s\n" + "fmla v12.4s, v29.4s, v6.4s\n" + "fmla v10.4s, v29.4s, v8.4s\n" + "fmla v11.4s, v29.4s, v7.4s\n" + "fmla v20.4s, v29.4s, v9.4s\n" + "fmla v24.4s, v29.4s, v19.4s\n" + "fmla v23.4s, v29.4s, v22.4s\n" + "fmla v17.4s, v28.4s, v3.4s\n" + "ldr s29, [x16, x27]\n" + "fmla v15.4s, v28.4s, v6.4s\n" + "ldr s22, [x15, x27]\n" + "fmla v1.4s, v28.4s, v4.4s\n" + "ldr x16, [%[inptrs], 264]\n" + "fmla v11.4s, v28.4s, v8.4s\n" + "ldr x15, [%[inptrs], 224]\n" + "fmla v21.4s, v28.4s, v7.4s\n" + "fmla v24.4s, v28.4s, v9.4s\n" + "fmla v14.4s, v29.4s, v3.4s\n" + "ldr s27, [x18, x27]\n" + "fmla v1.4s, v25.4s, v6.4s\n" + "ldr x18, [%[inptrs], 184]\n" + "fmla v10.4s, v29.4s, v4.4s\n" + "fmla v20.4s, v29.4s, v5.4s\n" + "fmla v21.4s, v25.4s, v8.4s\n" + "ldr s26, [x24, x27]\n" + "fmla v12.4s, v22.4s, v3.4s\n" + "ldr s25, [x16, x27]\n" + "fmla v11.4s, v22.4s, v4.4s\n" + "ldr x16, [%[inptrs], 272]\n" + "fmla v10.4s, v22.4s, v6.4s\n" + "fmla v20.4s, v22.4s, v7.4s\n" + "fmla v24.4s, v22.4s, v5.4s\n" + "fmla v23.4s, v22.4s, v19.4s\n" + "fmla v15.4s, v27.4s, v3.4s\n" + "ldr s31, [x15, x27]\n" + "fmla v11.4s, v27.4s, v6.4s\n" + "ldr s22, [x18, x27]\n" + "fmla v21.4s, v27.4s, v4.4s\n" + "ldr x15, [%[inptrs], 232]\n" + "fmla v20.4s, v27.4s, v8.4s\n" + "fmla v24.4s, v27.4s, v7.4s\n" + "fmla v23.4s, v27.4s, v9.4s\n" + "ldr s19, [x16, x27]\n" + "fmla v1.4s, v26.4s, v3.4s\n" + "ldr s28, [x15, x27]\n" + "fmla v21.4s, v26.4s, v6.4s\n" + "ldr x16, [%[inptrs], 280]\n" + "fmla v24.4s, v26.4s, v8.4s\n" + "fmla v10.4s, v25.4s, v3.4s\n" + "fmla v20.4s, v25.4s, v4.4s\n" + "ldr s30, [x16, x27]\n" + "fmla v23.4s, v25.4s, v5.4s\n" + "add x27, x27, #4\n" + "fmla v11.4s, v31.4s, v3.4s\n" + "fmla v21.4s, v22.4s, v3.4s\n" + "fmla v24.4s, v31.4s, v4.4s\n" + "movi v29.16b, #0\n" + "fmla v20.4s, v31.4s, v6.4s\n" + "fmla v23.4s, v31.4s, v7.4s\n" + "fmax v2.4s, v2.4s, v29.4s\n" + "fmax v18.4s, v18.4s, v29.4s\n" + "fmla v24.4s, v22.4s, v6.4s\n" + "fmax v17.4s, v17.4s, v29.4s\n" + "fmla v20.4s, v19.4s, v3.4s\n" + "fmax v1.4s, v1.4s, v29.4s\n" + "str s2, [x20, x28]\n" + "fmla v23.4s, v22.4s, v8.4s\n" + "fmax v16.4s, v16.4s, v29.4s\n" + "ldr x20, [%[outptrs], 8]\n" + "fmla v24.4s, v28.4s, v3.4s\n" + "fmax v0.4s, v0.4s, v29.4s\n" + "str s18, [x20, x28]\n" + "fmax v15.4s, v15.4s, v29.4s\n" + "str s16, [x21, x28]\n" + "fmla v23.4s, v19.4s, v4.4s\n" + "fmax v21.4s, v21.4s, v29.4s\n" + "ldr x20, [%[outptrs], 16]\n" + "fmax v13.4s, v13.4s, v29.4s\n" + "ldr x21, [%[outptrs], 40]\n" + "str s17, [x20, x28]\n" + "fmax v12.4s, v12.4s, v29.4s\n" + "str s0, [x21, x28]\n" + "fmla v23.4s, v28.4s, v6.4s\n" + "str s13, [x22, x28]\n" + "fmax v11.4s, v11.4s, v29.4s\n" + "fmax v24.4s, v24.4s, v29.4s\n" + "ldr x20, [%[outptrs], 24]\n" + "fmax v14.4s, v14.4s, v29.4s\n" + "ldr x21, [%[outptrs], 48]\n" + "str s1, [x20, x28]\n" + "fmla v23.4s, v30.4s, v3.4s\n" + "str s15, [x21, x28]\n" + "fmax v10.4s, v10.4s, v29.4s\n" + "str s14, [x23, x28]\n" + "fmax v20.4s, v20.4s, v29.4s\n" + "ldr x21, [%[outptrs], 56]\n" + "ldr x22, [%[outptrs], 72]\n" + "ldr x23, [%[outptrs], 104]\n" + "fmax v23.4s, v23.4s, v29.4s\n" + "str s21, [x21, x28]\n" + "str s12, [x22, x28]\n" + "str s10, [x23, x28]\n" + "ldr x22, [%[outptrs], 80]\n" + "ldr x23, [%[outptrs], 112]\n" + "str s11, [x22, x28]\n" + "str s20, [x23, x28]\n" + "ldr x22, [%[outptrs], 88]\n" + "ldr x23, [%[outptrs], 120]\n" + "str s24, [x22, x28]\n" + "str s23, [x23, x28]\n" + "add x28, x28, #4\n" + "7:\n" + : [wbptr] "+r" (weight_bias_ptr) + : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs) + : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory" + ); +} + template <> template <> void Conv::execute_tile( diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp index 1ae8128d55..cbdb19a067 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp @@ -63,7 +63,6 @@ DepthwiseConvolution< { } - template < unsigned int OutputTileRows, unsigned int OutputTileCols, unsigned int KernelRows, unsigned int KernelCols, @@ -92,7 +91,6 @@ void DepthwiseConvolution< // Perform the depthwise convolution int channels_remaining = n_channels; -#ifdef __aarch64__ for (; channels_remaining >= 8; channels_remaining -= 8) { // Load input tile @@ -140,6 +138,8 @@ void DepthwiseConvolution< for (unsigned int in_j = 0; in_j < KernelCols; in_j++) { const unsigned int j = base_j + in_j; + + // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j])); } } @@ -168,7 +168,6 @@ void DepthwiseConvolution< } outptr_base += 8; } -#endif // __aarch64__ for (; channels_remaining; channels_remaining--) { // Load input tile @@ -244,5 +243,172 @@ void DepthwiseConvolution< } } +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +template +void DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, StrideRows, StrideCols, + float16_t, float16_t, float16_t +>::execute_tile( + int n_channels, + const void *weights_biases_ptr, + const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols] +) +{ + // Instantiate pointers + const float16_t* __restrict__ params = static_cast(weights_biases_ptr); + int n = 0; + + // Perform the depthwise convolution + int channels_remaining = n_channels; + for (; channels_remaining >= 8; channels_remaining -= 8, n += 8) + { + // Load input tile + float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols]; + for (int i = 0; i < Base::inner_tile_rows; i++) + { + for (int j = 0; j < Base::inner_tile_cols; j++) + { + u[i][j] = vld1q_f16(inptrs[i][j] + n); + } + } + + // Load weights tile + float16x8_t vbias = vld1q_f16(params); + params += 8; + + float16x8_t w[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + w[i][j] = vld1q_f16(params); + params += 8; + } + } + + // Perform the convolution + float16x8_t v[OutputTileRows][OutputTileCols]; + for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) + { + for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) + { + v[out_i][out_j] = vbias; + + // Base co-ordinate + const int base_i = out_i * StrideRows; + const int base_j = out_j * StrideCols; + + // Fill the accumulator + for (unsigned int in_i = 0; in_i < KernelRows; in_i++) + { + const unsigned int i = base_i + in_i; + for (unsigned int in_j = 0; in_j < KernelCols; in_j++) + { + const unsigned int j = base_j + in_j; + + // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; + v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j])); + } + } + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f)); + } + if (Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f)); + } + } + } + + // Store the output tile + for (unsigned int i = 0; i < OutputTileRows; i++) + { + for (unsigned int j = 0; j < OutputTileCols; j++) + { + vst1q_f16(outptrs[i][j] + n, v[i][j]); + } + } + } + for (; channels_remaining; channels_remaining--, n++) + { + // Load input tile + float16_t u[Base::inner_tile_rows][Base::inner_tile_cols]; + for (int i = 0; i < Base::inner_tile_rows; i++) + { + for (int j = 0; j < Base::inner_tile_cols; j++) + { + u[i][j] = *(inptrs[i][j] + n); + } + } + + // Load weights tile + float16_t bias = *(params++); + float16_t w[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + w[i][j] = *(params++); + } + } + + // Perform the convolution + float16_t v[OutputTileRows][OutputTileCols]; + for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) + { + for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) + { + // Clear the accumulator + v[out_i][out_j] = bias; + + // Base co-ordinate + const int base_i = out_i * StrideRows; + const int base_j = out_j * StrideCols; + + // Fill the accumulator + for (unsigned int in_i = 0; in_i < KernelRows; in_i++) + { + const unsigned int i = base_i + in_i; + for (unsigned int in_j = 0; in_j < KernelCols; in_j++) + { + const int j = base_j + in_j; + v[out_i][out_j] += w[in_i][in_j] * u[i][j]; + } + } + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); + } + if (Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); + } + } + } + + // Store the output tile + for (unsigned int i = 0; i < OutputTileRows; i++) + { + for (unsigned int j = 0; j < OutputTileCols; j++) + { + *(outptrs[i][j] + n) = v[i][j]; + } + } + } +} + } // namespace depthwise #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp index 10d110feb8..264576137c 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp @@ -92,7 +92,6 @@ void DepthwiseConvolution< // Perform the depthwise convolution int channels_remaining = n_channels; -#ifdef __aarch64__ for (; channels_remaining >= 4; channels_remaining -= 4) { // Load input tile @@ -170,7 +169,6 @@ void DepthwiseConvolution< } outptr_base += 4; } -#endif // __aarch64__ for (; channels_remaining; channels_remaining--) { // Load input tile @@ -246,4 +244,171 @@ void DepthwiseConvolution< } } + +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +template +void DepthwiseConvolution< + OutputTileRows, OutputTileCols, + KernelRows, KernelCols, StrideRows, StrideCols, + float, float, float +>::execute_tile( + int n_channels, + const void *weights_biases_ptr, + const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + float *outptrs[Base::output_tile_rows][Base::output_tile_cols] +) +{ + const float* __restrict__ params = static_cast(weights_biases_ptr); + + // Perform the depthwise convolution + int channels_remaining = n_channels; + int n = 0; + for (; channels_remaining >= 4; channels_remaining -= 4, n += 4) + { + // Load input tile + float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols]; + for (int i = 0; i < Base::inner_tile_rows; i++) + { + for (int j = 0; j < Base::inner_tile_cols; j++) + { + u[i][j] = vld1q_f32(inptrs[i][j] + n); + } + } + + // Load weights tile + float32x4_t vbias = vld1q_f32(params); + params += 4; + + float32x4_t w[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + w[i][j] = vld1q_f32(params); + params += 4; + } + } + + // Perform the convolution + float32x4_t v[OutputTileRows][OutputTileCols]; + for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) + { + for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) + { + v[out_i][out_j] = vbias; + + // Base co-ordinate + const int base_i = out_i * StrideRows; + const int base_j = out_j * StrideCols; + + // Fill the accumulator + for (unsigned int in_i = 0; in_i < KernelRows; in_i++) + { + const unsigned int i = base_i + in_i; + for (unsigned int in_j = 0; in_j < KernelCols; in_j++) + { + const unsigned int j = base_j + in_j; + + // v[out_i][out_j] += w[in_i][in_j] * u[i][j]; + v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]); + } + } + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f)); + } + if (Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f)); + } + } + } + + // Store the output tile + for (unsigned int i = 0; i < OutputTileRows; i++) + { + for (unsigned int j = 0; j < OutputTileCols; j++) + { + vst1q_f32(outptrs[i][j] + n, v[i][j]); + } + } + } + for (; channels_remaining; channels_remaining--, n++) + { + // Load input tile + float u[Base::inner_tile_rows][Base::inner_tile_cols]; + for (int i = 0; i < Base::inner_tile_rows; i++) + { + for (int j = 0; j < Base::inner_tile_cols; j++) + { + u[i][j] = *(inptrs[i][j] + n); + } + } + + // Load weights tile + float bias = *(params++); + float w[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + w[i][j] = *(params++); + } + } + + // Perform the convolution + float v[OutputTileRows][OutputTileCols]; + for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++) + { + for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++) + { + // Clear the accumulator + v[out_i][out_j] = bias; + + // Base co-ordinate + const int base_i = out_i * StrideRows; + const int base_j = out_j * StrideCols; + + // Fill the accumulator + for (unsigned int in_i = 0; in_i < KernelRows; in_i++) + { + const unsigned int i = base_i + in_i; + for (unsigned int in_j = 0; in_j < KernelCols; in_j++) + { + const int j = base_j + in_j; + v[out_i][out_j] += w[in_i][in_j] * u[i][j]; + } + } + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]); + } + if (Activation == ActivationFunction::ReLU6) + { + v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]); + } + } + } + + // Store the output tile + for (unsigned int i = 0; i < OutputTileRows; i++) + { + for (unsigned int j = 0; j < OutputTileCols; j++) + { + *(outptrs[i][j] + n) = v[i][j]; + } + } + } +} + } // namespace depthwise diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp index 72f7c6b511..be73065b00 100644 --- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp +++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp @@ -631,4 +631,329 @@ void QAsymm8DepthwiseConvolution< } } +template < + unsigned int OutputTileRows, unsigned int OutputTileCols, + unsigned int KernelRows, unsigned int KernelCols, + unsigned int StrideRows, unsigned int StrideCols +> +template +void QAsymm8DepthwiseConvolution< + OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols +>::execute_tile( + int n_channels, + const void* packed_params, + const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols], + uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols] +) +{ + // Activation parameters (unused if Activation is None) + const uint8_t aqmin = _output_quant.offset; + const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ? + std::min(255u, _output_quant.quantize(6.0f)) : 255u; + + // Byte type pointer to weights and biases + const uint8_t *wbptr = static_cast(packed_params); + + // Offset into input/output tensors + int n = 0; + +#if defined(__aarch64__) // Under Aarch64 only use quad registers + for (; n_channels >= 16; n_channels -= 16, n += 16) + { + // Load biases + const int32x4_t biases[4] = { + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), + vld1q_s32(reinterpret_cast(wbptr) + 8), + vld1q_s32(reinterpret_cast(wbptr) + 12) + }; + wbptr += 16*sizeof(int32_t); + + // Load weights + uint8x16_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = vld1q_u8(wbptr); + wbptr += 16; + } + } + + // Load the input activations + uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) + { + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) + { + inputs[i][j] = vld1q_u8(inptrs[i][j] + n); + } + } + + // Perform the convolution + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + // Two sets of operations are required, we perform the + // multiply-accumulates for the convolution proper but must also sum + // the tile elements to account for the _weight_ offset. + uint32x4_t accs[4]; + for (unsigned int i = 0; i < 4; i++) + { + accs[i] = reinterpret_cast(biases[i]); + } + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + // Get relevant weight and activation pixel + const uint8x16_t w = weights[wi][wj]; + const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + + // Perform multiplication and accumulation + const uint16x8_t muls[2] = { + vmull_u8(vget_low_u8(w), vget_low_u8(x)), + vmull_u8(vget_high_u8(w), vget_high_u8(x)) + }; + + const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset); + const uint16x8_t sum_elems[2] = { + vmull_u8(vget_low_u8(x), woffset), + vmull_u8(vget_high_u8(x), woffset) + }; + + const uint32x4_t tmps[4] = { + vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])), + vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])), + vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])), + vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])), + }; + for (unsigned int i = 0; i < 4; i++) + { + accs[i] = vaddq_u32(accs[i], tmps[i]); + } + } + } + + // Rescale the accumulator and add in the new offset. + uint32x4_t final_accs[4]; + for (unsigned int i = 0; i < 4; i++) + { +#ifdef FIXED_POINT_REQUANTISATION + const int32x4_t y = rounding_divide_by_exp2( + saturating_doubling_high_mul( + reinterpret_cast(accs[i]), rescale_parameters.multiplier + ), + rescale_parameters.shift + ); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(_output_quant.offset)); + final_accs[i] = reinterpret_cast(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0))); +#else // floating point requantisation + float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast(accs[i])); + fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale)); + fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast(_output_quant.offset))); + fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f)); + final_accs[i] = vcvtq_u32_f32(fp_acc); +#endif + } + + uint8x16_t output = vcombine_u8( + vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))), + vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3]))) + ); + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = vmaxq_u8(output, vdupq_n_u8(aqmin)); + } + if (Activation == ActivationFunction::ReLU6) + { + output = vminq_u8(output, vdupq_n_u8(aqmax)); + } + + vst1q_u8(outptrs[oi][oj] + n, output); + } + } + } +#endif // defined(__aarch64__) + for (; n_channels >= 8; n_channels -= 8, n += 8) + { + const int32x4_t biases[2] = { + vld1q_s32(reinterpret_cast(wbptr)), + vld1q_s32(reinterpret_cast(wbptr) + 4), + }; + wbptr += 8*sizeof(int32_t); + + uint8x8_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = vld1_u8(wbptr); + wbptr += 8; + } + } + + uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) + { + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) + { + inputs[i][j] = vld1_u8(inptrs[i][j] + n); + } + } + + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + uint32x4_t accs[2]; + for (unsigned int i = 0; i < 2; i++) + { + accs[i] = reinterpret_cast(biases[i]); + } + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const uint8x8_t w = weights[wi][wj]; + const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + + const uint16x8_t muls = vmull_u8(w, x); + const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset); + const uint16x8_t sum_elems = vmull_u8(x, woffset); + + const uint32x4_t tmps[2] = { + vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)), + vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)), + }; + for (unsigned int i = 0; i < 2; i++) + { + accs[i] = vaddq_u32(accs[i], tmps[i]); + } + } + } + + uint32x4_t final_accs[2]; + for (unsigned int i = 0; i < 2; i++) + { +#ifdef FIXED_POINT_REQUANTISATION + const int32x4_t y = rounding_divide_by_exp2( + saturating_doubling_high_mul( + reinterpret_cast(accs[i]), rescale_parameters.multiplier + ), + rescale_parameters.shift + ); + const int32x4_t offset = reinterpret_cast(vdupq_n_u32(_output_quant.offset)); + final_accs[i] = reinterpret_cast(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0))); +#else // floating point requantisation + float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast(accs[i])); + fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale)); + fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast(_output_quant.offset))); + fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f)); + final_accs[i] = vcvtq_u32_f32(fp_acc); +#endif + } + + uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))); + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = vmax_u8(output, vdup_n_u8(aqmin)); + } + if (Activation == ActivationFunction::ReLU6) + { + output = vmin_u8(output, vdup_n_u8(aqmax)); + } + + vst1_u8(outptrs[oi][oj] + n, output); + } + } + } + for (; n_channels; n_channels--, n++) + { + // Load bias + const int32_t bias = *reinterpret_cast(wbptr); + wbptr += sizeof(int32_t); + + // Load weights + uint8_t weights[KernelRows][KernelCols]; + for (unsigned int i = 0; i < KernelRows; i++) + { + for (unsigned int j = 0; j < KernelCols; j++) + { + weights[i][j] = *(wbptr++); + } + } + + // Load the input activations + uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols]; + for (unsigned int i = 0; i < Base::inner_tile_rows; i++) + { + for (unsigned int j = 0; j < Base::inner_tile_cols; j++) + { + inputs[i][j] = *(inptrs[i][j] + n); + } + } + + // Perform the convolution + for (unsigned int oi = 0; oi < OutputTileRows; oi++) + { + for (unsigned int oj = 0; oj < OutputTileCols; oj++) + { + int32_t acc = bias; + uint32_t element_sum = 0; + + for (unsigned int wi = 0; wi < KernelRows; wi++) + { + for (unsigned int wj = 0; wj < KernelCols; wj++) + { + const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj]; + acc += static_cast(static_cast(w) * static_cast(x)); + element_sum += static_cast(x); + } + } + + acc -= static_cast(element_sum) * static_cast(_weights_quant.offset); + + // Requantize +#ifdef FIXED_POINT_REQUANTISATION + acc = rounding_divide_by_exp2( + saturating_doubling_high_mul(acc, rescale_parameters.multiplier), + rescale_parameters.shift + ); + acc += _output_quant.offset; + uint8_t output = clamp_to_limits::clamp_and_cast(acc); +#else // floating point requantization + float fp_acc = static_cast(acc); + fp_acc *= rescale_parameters.rescale; + fp_acc += static_cast(_output_quant.offset); + fp_acc = std::max(fp_acc, 0.0f); + uint8_t output = static_cast(std::min(static_cast(fp_acc), 255)); +#endif + + // Apply the activation function + if (Activation == ActivationFunction::ReLU || + Activation == ActivationFunction::ReLU6) + { + output = std::max(output, aqmin); + } + if (Activation == ActivationFunction::ReLU6) + { + output = std::min(output, aqmax); + } + + *(outptrs[oi][oj] + n) = output; + } + } + } +} + } // namespace depthwise -- cgit v1.2.1