/* * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "impl_fp32_fp32.hpp" namespace depthwise { using namespace neon_convolution_kernels; using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; #ifdef __aarch64__ template <> template <> void Conv::execute_tile( int n_channels, const void *weight_bias_ptr, const float *input, const unsigned int input_row_stride, const unsigned int input_col_stride, float *output, const unsigned int output_row_stride, const unsigned int output_col_stride ) { __asm __volatile( "add x8, %[inptr0], %[input_row_stride]\n" "add x15, %[input_col_stride1], %[input_col_stride1]\n" "add x23, %[outptr0], %[output_row_stride]\n" "add x9, x8, %[input_row_stride]\n" "add x16, x15, #64\n" "add x17, x15, %[input_col_stride1]\n" "add x10, x9, %[input_row_stride]\n" "add x18, x17, #64\n" "add x19, x17, %[input_col_stride1]\n" "add x11, x10, %[input_row_stride]\n" "add x20, x19, #64\n" "add x21, x19, %[input_col_stride1]\n" "add x12, x11, %[input_row_stride]\n" "add x22, x21, #64\n" "add x24, x23, %[output_row_stride]\n" "add x25, x24, %[output_row_stride]\n" "add x26, %[output_col_stride1], %[output_col_stride1]\n" "and x13, %[n_channels], #3\n" "add x27, x26, %[output_col_stride1]\n" "lsr x14, %[n_channels], #2\n" "cbz x14, 4f\n" "1:\n" "ldr q14, [%[wbptr]]\n" "subs x14, x14, #1\n" "mov v17.16b, v14.16b\n" "ldr q12, [%[wbptr], #16]\n" "mov v23.16b, v14.16b\n" "ldr q11, [%[wbptr], #32]\n" "mov v24.16b, v14.16b\n" "ldr q10, [%[wbptr], #48]\n" "mov v20.16b, v14.16b\n" "ldr q9, [%[wbptr], #64]\n" "mov v16.16b, v14.16b\n" "ldr q8, [%[wbptr], #80]\n" "mov v13.16b, v14.16b\n" "ldr q7, [%[wbptr], #96]\n" "mov v0.16b, v14.16b\n" "ldr q6, [%[wbptr], #112]\n" "mov v1.16b, v14.16b\n" "ldr q5, [%[wbptr], #128]\n" "mov v2.16b, v14.16b\n" "ldr q4, [%[wbptr], #144]\n" "mov v3.16b, v14.16b\n" "ldr q29, [%[inptr0]]\n" "fmla v17.4s, v29.4s, v12.4s\n" "ldr q28, [x8]\n" "ldr q30, [%[inptr0], %[input_col_stride1]]\n" "ldr q25, [x9]\n" "ldr q26, [x8, %[input_col_stride1]]\n" "ldr q27, [%[inptr0], x15]\n" "ldr q15, [x10]\n" "ldr q18, [x9, %[input_col_stride1]]\n" "prfm pldl1keep, [%[inptr0], #64]\n" "prfm pldl1keep, [x8, #64]\n" "prfm pldl1keep, [%[inptr0], x28]\n" "prfm pldl1keep, [x9, #64]\n" "prfm pldl1keep, [x8, x28]\n" "prfm pldl1keep, [%[inptr0], x16]\n" "prfm pldl1keep, [x10, #64]\n" "prfm pldl1keep, [x9, x28]\n" "beq 3f\n" "2:\n" "fmla v17.4s, v28.4s, v9.4s\n" "prfm pldl1keep, [x8, x16]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr q22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr q29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x11, #64]\n" "fmla v20.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x10, x28]\n" "fmla v17.4s, v25.4s, v6.4s\n" "ldr q25, [x11]\n" "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" "prfm pldl1keep, [x8, x18]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" "ldr q28, [x10, %[input_col_stride1]]\n" "fmla v24.4s, v27.4s, v11.4s\n" "prfm pldl1keep, [x12, #64]\n" "fmla v17.4s, v27.4s, v10.4s\n" "prfm pldl1keep, [x11, x28]\n" "fmla v13.4s, v27.4s, v12.4s\n" "ldr q19, [x9, x15]\n" "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr q21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" "prfm pldl1keep, [x8, x20]\n" "fmla v23.4s, v18.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x22]\n" "fmla v24.4s, v18.4s, v6.4s\n" "prfm pldl1keep, [x12, x28]\n" "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" "prfm pldl1keep, [x10, x18]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr q27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" "prfm pldl1keep, [x9, x20]\n" "fmla v23.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x8, x22]\n" "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x11, x18]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" "ldr q18, [x12]\n" "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" "prfm pldl1keep, [x12, x18]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr q22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x11, x20]\n" "fmla v0.4s, v25.4s, v9.4s\n" "ldr q25, [x10, x15]\n" "fmla v23.4s, v28.4s, v5.4s\n" "prfm pldl1keep, [x10, x22]\n" "fmla v20.4s, v28.4s, v8.4s\n" "prfm pldl1keep, [x12, x20]\n" "fmla v16.4s, v28.4s, v6.4s\n" "prfm pldl1keep, [x11, x22]\n" "fmla v0.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [x12, x22]\n" "fmla v1.4s, v28.4s, v9.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v17.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v23.4s, v19.4s, v7.4s\n" "subs x14, x14, #1\n" "fmla v24.4s, v19.4s, v5.4s\n" "fmla v20.4s, v19.4s, v10.4s\n" "str q17, [%[outptr0]]\n" "mov v15.16b, v14.16b\n" "fmla v16.4s, v19.4s, v8.4s\n" "fmla v13.4s, v19.4s, v6.4s\n" "fmla v15.4s, v28.4s, v12.4s\n" "ldr q29, [x9, x17]\n" "fmla v1.4s, v19.4s, v11.4s\n" "fmla v2.4s, v19.4s, v9.4s\n" "fmla v24.4s, v21.4s, v7.4s\n" "fmla v16.4s, v21.4s, v10.4s\n" "fmla v13.4s, v21.4s, v8.4s\n" "fmla v3.4s, v21.4s, v9.4s\n" "fmla v2.4s, v21.4s, v11.4s\n" "fmla v0.4s, v18.4s, v6.4s\n" "mov v18.16b, v14.16b\n" "fmla v20.4s, v22.4s, v5.4s\n" "fmla v13.4s, v27.4s, v10.4s\n" "fmla v3.4s, v27.4s, v11.4s\n" "mov v17.16b, v14.16b\n" "fmla v18.4s, v19.4s, v12.4s\n" "mov v19.16b, v14.16b\n" "fmla v0.4s, v22.4s, v8.4s\n" "fmla v17.4s, v21.4s, v12.4s\n" "ldr q26, [x8, x19]\n" "fmla v1.4s, v22.4s, v6.4s\n" "fmla v15.4s, v22.4s, v9.4s\n" "mov v22.16b, v14.16b\n" "mov v21.16b, v14.16b\n" "fmla v23.4s, v25.4s, v4.4s\n" "fmla v20.4s, v25.4s, v7.4s\n" "fmla v16.4s, v25.4s, v5.4s\n" "fmla v0.4s, v25.4s, v10.4s\n" "fmla v1.4s, v25.4s, v8.4s\n" "fmla v2.4s, v25.4s, v6.4s\n" "str q23, [x23]\n" "fmla v15.4s, v25.4s, v11.4s\n" "fmla v18.4s, v25.4s, v9.4s\n" "ldr q28, [%[inptr0], x21]\n" "fmla v19.4s, v25.4s, v12.4s\n" "ldr q30, [x12, %[input_col_stride1]]\n" "fmla v24.4s, v29.4s, v4.4s\n" "add %[inptr0], %[inptr0], #16\n" "fmla v16.4s, v29.4s, v7.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "fmla v13.4s, v29.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x28]\n" "str q24, [%[outptr0], %[output_col_stride1]]\n" "fmla v1.4s, v29.4s, v10.4s\n" "fmla v2.4s, v29.4s, v8.4s\n" "ldr q27, [x11, x15]\n" "fmla v3.4s, v29.4s, v6.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v18.4s, v29.4s, v11.4s\n" "fmla v17.4s, v29.4s, v9.4s\n" "fmla v22.4s, v29.4s, v12.4s\n" "ldr q23, [x10, x17]\n" "fmla v13.4s, v26.4s, v7.4s\n" "fmla v2.4s, v26.4s, v10.4s\n" "fmla v3.4s, v26.4s, v8.4s\n" "fmla v17.4s, v26.4s, v11.4s\n" "fmla v0.4s, v30.4s, v5.4s\n" "ldr q24, [x9, x19]\n" "fmla v15.4s, v30.4s, v6.4s\n" "ldr q29, [x8, x21]\n" "fmla v3.4s, v28.4s, v10.4s\n" "ldr q14, [x12, x15]\n" "fmla v20.4s, v27.4s, v4.4s\n" "add x8, x8, #16\n" "fmla v0.4s, v27.4s, v7.4s\n" "prfm pldl1keep, [x8, #64]\n" "fmla v1.4s, v27.4s, v5.4s\n" "prfm pldl1keep, [x8, x28]\n" "str q20, [x24]\n" "fmla v15.4s, v27.4s, v8.4s\n" "fmla v18.4s, v27.4s, v6.4s\n" "ldr q25, [x11, x17]\n" "fmla v19.4s, v27.4s, v9.4s\n" "ldr q30, [x10, x19]\n" "fmla v16.4s, v23.4s, v4.4s\n" "fmla v1.4s, v23.4s, v7.4s\n" "fmla v2.4s, v23.4s, v5.4s\n" "fmla v15.4s, v23.4s, v10.4s\n" "fmla v18.4s, v23.4s, v8.4s\n" "fmla v17.4s, v23.4s, v6.4s\n" "str q16, [x23, %[output_col_stride1]]\n" "fmla v19.4s, v23.4s, v11.4s\n" "fmla v22.4s, v23.4s, v9.4s\n" "ldr q26, [x9, x21]\n" "fmla v21.4s, v23.4s, v12.4s\n" "ldr q27, [x12, x17]\n" "fmla v13.4s, v24.4s, v4.4s\n" "ldr q20, [x11, x19]\n" "fmla v2.4s, v24.4s, v7.4s\n" "add x9, x9, #16\n" "fmla v3.4s, v24.4s, v5.4s\n" "prfm pldl1keep, [x9, #64]\n" "str q13, [%[outptr0], x26]\n" "fmla v18.4s, v24.4s, v10.4s\n" "fmla v17.4s, v24.4s, v8.4s\n" "ldr q23, [x10, x21]\n" "fmla v22.4s, v24.4s, v11.4s\n" "ldr q24, [x12, x19]\n" "fmla v3.4s, v29.4s, v7.4s\n" "prfm pldl1keep, [x9, x28]\n" "fmla v17.4s, v29.4s, v10.4s\n" "ldr q16, [x11, x21]\n" "fmla v0.4s, v14.4s, v4.4s\n" "add x10, x10, #16\n" "fmla v15.4s, v14.4s, v5.4s\n" "prfm pldl1keep, [x10, #64]\n" "fmla v19.4s, v14.4s, v6.4s\n" "ldr q13, [x12, x21]\n" "str q0, [x25]\n" "fmla v1.4s, v25.4s, v4.4s\n" "fmla v15.4s, v25.4s, v7.4s\n" "ldr q14, [%[wbptr]]\n" "fmla v18.4s, v25.4s, v5.4s\n" "add x11, x11, #16\n" "str q1, [x24, %[output_col_stride1]]\n" "fmla v19.4s, v25.4s, v8.4s\n" "fmla v22.4s, v25.4s, v6.4s\n" "ldr q12, [%[wbptr], #16]\n" "fmla v21.4s, v25.4s, v9.4s\n" "ldr q29, [%[inptr0]]\n" "fmla v2.4s, v30.4s, v4.4s\n" "ldr q28, [x8]\n" "fmla v18.4s, v30.4s, v7.4s\n" "add x12, x12, #16\n" "fmla v17.4s, v30.4s, v5.4s\n" "fmla v19.4s, v30.4s, v10.4s\n" "str q2, [x23, x26]\n" "fmla v22.4s, v30.4s, v8.4s\n" "fmla v21.4s, v30.4s, v11.4s\n" "ldr q9, [%[wbptr], #64]\n" "fmla v3.4s, v26.4s, v4.4s\n" "ldr q30, [%[inptr0], %[input_col_stride1]]\n" "fmla v17.4s, v26.4s, v7.4s\n" "ldr q25, [x9]\n" "fmla v22.4s, v26.4s, v10.4s\n" "ldr q11, [%[wbptr], #32]\n" "str q3, [%[outptr0], x27]\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v19.4s, v27.4s, v5.4s\n" "ldr q26, [x8, %[input_col_stride1]]\n" "fmla v21.4s, v27.4s, v6.4s\n" "ldr q27, [%[inptr0], x15]\n" "str q15, [x25, %[output_col_stride1]]\n" "fmla v18.4s, v20.4s, v4.4s\n" "fmla v19.4s, v20.4s, v7.4s\n" "ldr q15, [x10]\n" "fmla v22.4s, v20.4s, v5.4s\n" "ldr q6, [%[wbptr], #112]\n" "str q18, [x24, x26]\n" "fmla v21.4s, v20.4s, v8.4s\n" "fmla v17.4s, v23.4s, v4.4s\n" "ldr q18, [x9, %[input_col_stride1]]\n" "fmla v22.4s, v23.4s, v7.4s\n" "add %[outptr0], %[outptr0], #16\n" "fmla v21.4s, v23.4s, v10.4s\n" "ldr q8, [%[wbptr], #80]\n" "str q17, [x23, x27]\n" "fmla v19.4s, v24.4s, v4.4s\n" "fmla v22.4s, v16.4s, v4.4s\n" "add x23, x23, #16\n" "fmla v21.4s, v24.4s, v5.4s\n" "ldr q10, [%[wbptr], #48]\n" "str q19, [x25, x26]\n" "mov v17.16b, v14.16b\n" "str q22, [x24, x27]\n" "mov v23.16b, v14.16b\n" "fmla v21.4s, v16.4s, v7.4s\n" "ldr q5, [%[wbptr], #128]\n" "mov v24.16b, v14.16b\n" "add x24, x24, #16\n" "mov v20.16b, v14.16b\n" "mov v16.16b, v14.16b\n" "fmla v21.4s, v13.4s, v4.4s\n" "ldr q7, [%[wbptr], #96]\n" "mov v13.16b, v14.16b\n" "mov v0.16b, v14.16b\n" "mov v1.16b, v14.16b\n" "mov v2.16b, v14.16b\n" "str q21, [x25, x27]\n" "mov v3.16b, v14.16b\n" "ldr q4, [%[wbptr], #144]\n" "add x25, x25, #16\n" "fmla v17.4s, v29.4s, v12.4s\n" "bne 2b\n" "3:\n" "fmla v17.4s, v28.4s, v9.4s\n" "prfm pldl1keep, [x8, x16]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr q22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr q29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x11, #64]\n" "fmla v20.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x10, x28]\n" "fmla v17.4s, v25.4s, v6.4s\n" "ldr q25, [x11]\n" "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" "prfm pldl1keep, [x8, x18]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" "ldr q28, [x10, %[input_col_stride1]]\n" "fmla v24.4s, v27.4s, v11.4s\n" "prfm pldl1keep, [x12, #64]\n" "fmla v17.4s, v27.4s, v10.4s\n" "prfm pldl1keep, [x11, x28]\n" "fmla v13.4s, v27.4s, v12.4s\n" "ldr q19, [x9, x15]\n" "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr q21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" "prfm pldl1keep, [x8, x20]\n" "fmla v23.4s, v18.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x22]\n" "fmla v24.4s, v18.4s, v6.4s\n" "prfm pldl1keep, [x12, x28]\n" "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" "prfm pldl1keep, [x10, x18]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr q27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" "prfm pldl1keep, [x9, x20]\n" "fmla v23.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x8, x22]\n" "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x11, x18]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" "ldr q18, [x12]\n" "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" "prfm pldl1keep, [x12, x18]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr q22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x11, x20]\n" "fmla v0.4s, v25.4s, v9.4s\n" "ldr q25, [x10, x15]\n" "fmla v23.4s, v28.4s, v5.4s\n" "prfm pldl1keep, [x10, x22]\n" "fmla v20.4s, v28.4s, v8.4s\n" "prfm pldl1keep, [x12, x20]\n" "fmla v16.4s, v28.4s, v6.4s\n" "prfm pldl1keep, [x11, x22]\n" "fmla v0.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [x12, x22]\n" "fmla v1.4s, v28.4s, v9.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v17.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v23.4s, v19.4s, v7.4s\n" "fmla v24.4s, v19.4s, v5.4s\n" "fmla v20.4s, v19.4s, v10.4s\n" "fmla v16.4s, v19.4s, v8.4s\n" "str q17, [%[outptr0]]\n" "mov v15.16b, v14.16b\n" "fmla v13.4s, v19.4s, v6.4s\n" "fmla v1.4s, v19.4s, v11.4s\n" "fmla v15.4s, v28.4s, v12.4s\n" "ldr q29, [x9, x17]\n" "fmla v2.4s, v19.4s, v9.4s\n" "fmla v24.4s, v21.4s, v7.4s\n" "fmla v16.4s, v21.4s, v10.4s\n" "fmla v13.4s, v21.4s, v8.4s\n" "fmla v3.4s, v21.4s, v9.4s\n" "fmla v0.4s, v18.4s, v6.4s\n" "mov v18.16b, v14.16b\n" "fmla v2.4s, v21.4s, v11.4s\n" "fmla v13.4s, v27.4s, v10.4s\n" "fmla v20.4s, v22.4s, v5.4s\n" "fmla v18.4s, v19.4s, v12.4s\n" "ldr q26, [x8, x19]\n" "fmla v3.4s, v27.4s, v11.4s\n" "ldr q28, [%[inptr0], x21]\n" "fmla v0.4s, v22.4s, v8.4s\n" "add %[inptr0], %[inptr0], #16\n" "fmla v1.4s, v22.4s, v6.4s\n" "fmla v15.4s, v22.4s, v9.4s\n" "mov v17.16b, v14.16b\n" "fmla v23.4s, v25.4s, v4.4s\n" "fmla v20.4s, v25.4s, v7.4s\n" "fmla v16.4s, v25.4s, v5.4s\n" "fmla v17.4s, v21.4s, v12.4s\n" "ldr q30, [x12, %[input_col_stride1]]\n" "str q23, [x23]\n" "mov v19.16b, v14.16b\n" "fmla v0.4s, v25.4s, v10.4s\n" "fmla v1.4s, v25.4s, v8.4s\n" "fmla v2.4s, v25.4s, v6.4s\n" "fmla v15.4s, v25.4s, v11.4s\n" "fmla v18.4s, v25.4s, v9.4s\n" "fmla v19.4s, v25.4s, v12.4s\n" "mov v22.16b, v14.16b\n" "mov v21.16b, v14.16b\n" "fmla v24.4s, v29.4s, v4.4s\n" "fmla v16.4s, v29.4s, v7.4s\n" "fmla v13.4s, v29.4s, v5.4s\n" "fmla v1.4s, v29.4s, v10.4s\n" "fmla v2.4s, v29.4s, v8.4s\n" "fmla v3.4s, v29.4s, v6.4s\n" "str q24, [%[outptr0], %[output_col_stride1]]\n" "fmla v18.4s, v29.4s, v11.4s\n" "fmla v17.4s, v29.4s, v9.4s\n" "ldr q27, [x11, x15]\n" "fmla v22.4s, v29.4s, v12.4s\n" "ldr q23, [x10, x17]\n" "fmla v13.4s, v26.4s, v7.4s\n" "fmla v2.4s, v26.4s, v10.4s\n" "fmla v3.4s, v26.4s, v8.4s\n" "fmla v17.4s, v26.4s, v11.4s\n" "fmla v0.4s, v30.4s, v5.4s\n" "ldr q24, [x9, x19]\n" "fmla v15.4s, v30.4s, v6.4s\n" "ldr q29, [x8, x21]\n" "fmla v3.4s, v28.4s, v10.4s\n" "ldr q14, [x12, x15]\n" "fmla v20.4s, v27.4s, v4.4s\n" "add x8, x8, #16\n" "fmla v0.4s, v27.4s, v7.4s\n" "fmla v1.4s, v27.4s, v5.4s\n" "fmla v15.4s, v27.4s, v8.4s\n" "fmla v18.4s, v27.4s, v6.4s\n" "str q20, [x24]\n" "fmla v19.4s, v27.4s, v9.4s\n" "fmla v16.4s, v23.4s, v4.4s\n" "ldr q25, [x11, x17]\n" "fmla v1.4s, v23.4s, v7.4s\n" "ldr q30, [x10, x19]\n" "fmla v2.4s, v23.4s, v5.4s\n" "fmla v15.4s, v23.4s, v10.4s\n" "str q16, [x23, %[output_col_stride1]]\n" "fmla v18.4s, v23.4s, v8.4s\n" "fmla v17.4s, v23.4s, v6.4s\n" "ldr q26, [x9, x21]\n" "fmla v19.4s, v23.4s, v11.4s\n" "add x9, x9, #16\n" "fmla v22.4s, v23.4s, v9.4s\n" "fmla v21.4s, v23.4s, v12.4s\n" "fmla v13.4s, v24.4s, v4.4s\n" "ldr q27, [x12, x17]\n" "fmla v2.4s, v24.4s, v7.4s\n" "ldr q20, [x11, x19]\n" "fmla v3.4s, v24.4s, v5.4s\n" "fmla v18.4s, v24.4s, v10.4s\n" "str q13, [%[outptr0], x26]\n" "fmla v17.4s, v24.4s, v8.4s\n" "fmla v22.4s, v24.4s, v11.4s\n" "ldr q23, [x10, x21]\n" "fmla v3.4s, v29.4s, v7.4s\n" "ldr q24, [x12, x19]\n" "fmla v17.4s, v29.4s, v10.4s\n" "ldr q16, [x11, x21]\n" "fmla v0.4s, v14.4s, v4.4s\n" "add x10, x10, #16\n" "fmla v15.4s, v14.4s, v5.4s\n" "add x11, x11, #16\n" "fmla v19.4s, v14.4s, v6.4s\n" "ldr q13, [x12, x21]\n" "str q0, [x25]\n" "fmla v1.4s, v25.4s, v4.4s\n" "fmla v15.4s, v25.4s, v7.4s\n" "add x12, x12, #16\n" "fmla v18.4s, v25.4s, v5.4s\n" "fmla v19.4s, v25.4s, v8.4s\n" "str q1, [x24, %[output_col_stride1]]\n" "fmla v22.4s, v25.4s, v6.4s\n" "fmla v21.4s, v25.4s, v9.4s\n" "fmla v2.4s, v30.4s, v4.4s\n" "fmla v18.4s, v30.4s, v7.4s\n" "fmla v17.4s, v30.4s, v5.4s\n" "fmla v19.4s, v30.4s, v10.4s\n" "fmla v22.4s, v30.4s, v8.4s\n" "str q2, [x23, x26]\n" "fmla v21.4s, v30.4s, v11.4s\n" "fmla v3.4s, v26.4s, v4.4s\n" "fmla v17.4s, v26.4s, v7.4s\n" "fmla v22.4s, v26.4s, v10.4s\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v19.4s, v27.4s, v5.4s\n" "fmla v21.4s, v27.4s, v6.4s\n" "str q3, [%[outptr0], x27]\n" "fmla v18.4s, v20.4s, v4.4s\n" "str q15, [x25, %[output_col_stride1]]\n" "fmla v22.4s, v20.4s, v5.4s\n" "fmla v19.4s, v20.4s, v7.4s\n" "add %[outptr0], %[outptr0], #16\n" "str q18, [x24, x26]\n" "fmla v21.4s, v20.4s, v8.4s\n" "fmla v17.4s, v23.4s, v4.4s\n" "fmla v22.4s, v23.4s, v7.4s\n" "fmla v19.4s, v24.4s, v4.4s\n" "fmla v21.4s, v23.4s, v10.4s\n" "str q17, [x23, x27]\n" "fmla v22.4s, v16.4s, v4.4s\n" "str q19, [x25, x26]\n" "add x23, x23, #16\n" "fmla v21.4s, v24.4s, v5.4s\n" "str q22, [x24, x27]\n" "add x24, x24, #16\n" "fmla v21.4s, v16.4s, v7.4s\n" "fmla v21.4s, v13.4s, v4.4s\n" "str q21, [x25, x27]\n" "add x25, x25, #16\n" "4:\n" "cbz x13, 7f\n" "ldr s14, [%[wbptr]]\n" "mov v17.16b, v14.16b\n" "ldr s12, [%[wbptr], #4]\n" "mov v23.16b, v14.16b\n" "ldr s11, [%[wbptr], #8]\n" "mov v24.16b, v14.16b\n" "ldr s10, [%[wbptr], #12]\n" "mov v20.16b, v14.16b\n" "ldr s9, [%[wbptr], #16]\n" "mov v16.16b, v14.16b\n" "ldr s8, [%[wbptr], #20]\n" "mov v13.16b, v14.16b\n" "ldr s7, [%[wbptr], #24]\n" "mov v0.16b, v14.16b\n" "ldr s6, [%[wbptr], #28]\n" "mov v1.16b, v14.16b\n" "ldr s5, [%[wbptr], #32]\n" "mov v2.16b, v14.16b\n" "ldr s4, [%[wbptr], #36]\n" "mov v3.16b, v14.16b\n" "ldr s29, [%[inptr0]]\n" "fmla v17.4s, v29.4s, v12.4s\n" "ldr s28, [x8]\n" "ldr s30, [%[inptr0], %[input_col_stride1]]\n" "subs x13, x13, #1\n" "ldr s25, [x9]\n" "ldr s26, [x8, %[input_col_stride1]]\n" "ldr s27, [%[inptr0], x15]\n" "ldr s15, [x10]\n" "ldr s18, [x9, %[input_col_stride1]]\n" "prfm pldl1keep, [%[inptr0], #64]\n" "prfm pldl1keep, [x8, #64]\n" "prfm pldl1keep, [%[inptr0], x28]\n" "prfm pldl1keep, [x9, #64]\n" "prfm pldl1keep, [x8, x28]\n" "prfm pldl1keep, [%[inptr0], x16]\n" "prfm pldl1keep, [x10, #64]\n" "prfm pldl1keep, [x9, x28]\n" "beq 6f\n" "5:\n" "fmla v17.4s, v28.4s, v9.4s\n" "prfm pldl1keep, [x8, x16]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr s22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr s29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x11, #64]\n" "fmla v20.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x10, x28]\n" "fmla v17.4s, v25.4s, v6.4s\n" "ldr s25, [x11]\n" "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" "prfm pldl1keep, [x8, x18]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" "ldr s28, [x10, %[input_col_stride1]]\n" "fmla v24.4s, v27.4s, v11.4s\n" "prfm pldl1keep, [x12, #64]\n" "fmla v17.4s, v27.4s, v10.4s\n" "prfm pldl1keep, [x11, x28]\n" "fmla v13.4s, v27.4s, v12.4s\n" "ldr s19, [x9, x15]\n" "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr s21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" "prfm pldl1keep, [x8, x20]\n" "fmla v23.4s, v18.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x22]\n" "fmla v24.4s, v18.4s, v6.4s\n" "prfm pldl1keep, [x12, x28]\n" "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" "prfm pldl1keep, [x10, x18]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr s27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" "prfm pldl1keep, [x9, x20]\n" "fmla v23.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x8, x22]\n" "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x11, x18]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" "ldr s18, [x12]\n" "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" "prfm pldl1keep, [x12, x18]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr s22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x11, x20]\n" "fmla v0.4s, v25.4s, v9.4s\n" "ldr s25, [x10, x15]\n" "fmla v23.4s, v28.4s, v5.4s\n" "prfm pldl1keep, [x10, x22]\n" "fmla v20.4s, v28.4s, v8.4s\n" "prfm pldl1keep, [x12, x20]\n" "fmla v16.4s, v28.4s, v6.4s\n" "prfm pldl1keep, [x11, x22]\n" "fmla v0.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [x12, x22]\n" "fmla v1.4s, v28.4s, v9.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v17.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v23.4s, v19.4s, v7.4s\n" "subs x13, x13, #1\n" "fmla v24.4s, v19.4s, v5.4s\n" "fmla v20.4s, v19.4s, v10.4s\n" "str s17, [%[outptr0]]\n" "mov v15.16b, v14.16b\n" "fmla v16.4s, v19.4s, v8.4s\n" "fmla v13.4s, v19.4s, v6.4s\n" "fmla v15.4s, v28.4s, v12.4s\n" "ldr s29, [x9, x17]\n" "fmla v1.4s, v19.4s, v11.4s\n" "fmla v2.4s, v19.4s, v9.4s\n" "fmla v24.4s, v21.4s, v7.4s\n" "fmla v16.4s, v21.4s, v10.4s\n" "fmla v13.4s, v21.4s, v8.4s\n" "fmla v3.4s, v21.4s, v9.4s\n" "fmla v2.4s, v21.4s, v11.4s\n" "fmla v0.4s, v18.4s, v6.4s\n" "mov v18.16b, v14.16b\n" "fmla v20.4s, v22.4s, v5.4s\n" "fmla v13.4s, v27.4s, v10.4s\n" "fmla v3.4s, v27.4s, v11.4s\n" "mov v17.16b, v14.16b\n" "fmla v18.4s, v19.4s, v12.4s\n" "mov v19.16b, v14.16b\n" "fmla v0.4s, v22.4s, v8.4s\n" "fmla v17.4s, v21.4s, v12.4s\n" "ldr s26, [x8, x19]\n" "fmla v1.4s, v22.4s, v6.4s\n" "fmla v15.4s, v22.4s, v9.4s\n" "mov v22.16b, v14.16b\n" "mov v21.16b, v14.16b\n" "fmla v23.4s, v25.4s, v4.4s\n" "fmla v20.4s, v25.4s, v7.4s\n" "fmla v16.4s, v25.4s, v5.4s\n" "fmla v0.4s, v25.4s, v10.4s\n" "fmla v1.4s, v25.4s, v8.4s\n" "fmla v2.4s, v25.4s, v6.4s\n" "str s23, [x23]\n" "fmla v15.4s, v25.4s, v11.4s\n" "fmla v18.4s, v25.4s, v9.4s\n" "ldr s28, [%[inptr0], x21]\n" "fmla v19.4s, v25.4s, v12.4s\n" "ldr s30, [x12, %[input_col_stride1]]\n" "fmla v24.4s, v29.4s, v4.4s\n" "add %[inptr0], %[inptr0], #4\n" "fmla v16.4s, v29.4s, v7.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "fmla v13.4s, v29.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x28]\n" "str s24, [%[outptr0], %[output_col_stride1]]\n" "fmla v1.4s, v29.4s, v10.4s\n" "fmla v2.4s, v29.4s, v8.4s\n" "ldr s27, [x11, x15]\n" "fmla v3.4s, v29.4s, v6.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v18.4s, v29.4s, v11.4s\n" "fmla v17.4s, v29.4s, v9.4s\n" "fmla v22.4s, v29.4s, v12.4s\n" "ldr s23, [x10, x17]\n" "fmla v13.4s, v26.4s, v7.4s\n" "fmla v2.4s, v26.4s, v10.4s\n" "fmla v3.4s, v26.4s, v8.4s\n" "fmla v17.4s, v26.4s, v11.4s\n" "fmla v0.4s, v30.4s, v5.4s\n" "ldr s24, [x9, x19]\n" "fmla v15.4s, v30.4s, v6.4s\n" "ldr s29, [x8, x21]\n" "fmla v3.4s, v28.4s, v10.4s\n" "ldr s14, [x12, x15]\n" "fmla v20.4s, v27.4s, v4.4s\n" "add x8, x8, #4\n" "fmla v0.4s, v27.4s, v7.4s\n" "prfm pldl1keep, [x8, #64]\n" "fmla v1.4s, v27.4s, v5.4s\n" "prfm pldl1keep, [x8, x28]\n" "str s20, [x24]\n" "fmla v15.4s, v27.4s, v8.4s\n" "fmla v18.4s, v27.4s, v6.4s\n" "ldr s25, [x11, x17]\n" "fmla v19.4s, v27.4s, v9.4s\n" "ldr s30, [x10, x19]\n" "fmla v16.4s, v23.4s, v4.4s\n" "fmla v1.4s, v23.4s, v7.4s\n" "fmla v2.4s, v23.4s, v5.4s\n" "fmla v15.4s, v23.4s, v10.4s\n" "fmla v18.4s, v23.4s, v8.4s\n" "fmla v17.4s, v23.4s, v6.4s\n" "str s16, [x23, %[output_col_stride1]]\n" "fmla v19.4s, v23.4s, v11.4s\n" "fmla v22.4s, v23.4s, v9.4s\n" "ldr s26, [x9, x21]\n" "fmla v21.4s, v23.4s, v12.4s\n" "ldr s27, [x12, x17]\n" "fmla v13.4s, v24.4s, v4.4s\n" "ldr s20, [x11, x19]\n" "fmla v2.4s, v24.4s, v7.4s\n" "add x9, x9, #4\n" "fmla v3.4s, v24.4s, v5.4s\n" "prfm pldl1keep, [x9, #64]\n" "str s13, [%[outptr0], x26]\n" "fmla v18.4s, v24.4s, v10.4s\n" "fmla v17.4s, v24.4s, v8.4s\n" "ldr s23, [x10, x21]\n" "fmla v22.4s, v24.4s, v11.4s\n" "ldr s24, [x12, x19]\n" "fmla v3.4s, v29.4s, v7.4s\n" "prfm pldl1keep, [x9, x28]\n" "fmla v17.4s, v29.4s, v10.4s\n" "ldr s16, [x11, x21]\n" "fmla v0.4s, v14.4s, v4.4s\n" "add x10, x10, #4\n" "fmla v15.4s, v14.4s, v5.4s\n" "prfm pldl1keep, [x10, #64]\n" "fmla v19.4s, v14.4s, v6.4s\n" "ldr s13, [x12, x21]\n" "str s0, [x25]\n" "fmla v1.4s, v25.4s, v4.4s\n" "fmla v15.4s, v25.4s, v7.4s\n" "ldr s14, [%[wbptr]]\n" "fmla v18.4s, v25.4s, v5.4s\n" "add x11, x11, #4\n" "str s1, [x24, %[output_col_stride1]]\n" "fmla v19.4s, v25.4s, v8.4s\n" "fmla v22.4s, v25.4s, v6.4s\n" "ldr s12, [%[wbptr], #4]\n" "fmla v21.4s, v25.4s, v9.4s\n" "ldr s29, [%[inptr0]]\n" "fmla v2.4s, v30.4s, v4.4s\n" "ldr s28, [x8]\n" "fmla v18.4s, v30.4s, v7.4s\n" "add x12, x12, #4\n" "fmla v17.4s, v30.4s, v5.4s\n" "fmla v19.4s, v30.4s, v10.4s\n" "str s2, [x23, x26]\n" "fmla v22.4s, v30.4s, v8.4s\n" "fmla v21.4s, v30.4s, v11.4s\n" "ldr s9, [%[wbptr], #16]\n" "fmla v3.4s, v26.4s, v4.4s\n" "ldr s30, [%[inptr0], %[input_col_stride1]]\n" "fmla v17.4s, v26.4s, v7.4s\n" "ldr s25, [x9]\n" "fmla v22.4s, v26.4s, v10.4s\n" "ldr s11, [%[wbptr], #8]\n" "str s3, [%[outptr0], x27]\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v19.4s, v27.4s, v5.4s\n" "ldr s26, [x8, %[input_col_stride1]]\n" "fmla v21.4s, v27.4s, v6.4s\n" "ldr s27, [%[inptr0], x15]\n" "str s15, [x25, %[output_col_stride1]]\n" "fmla v18.4s, v20.4s, v4.4s\n" "fmla v19.4s, v20.4s, v7.4s\n" "ldr s15, [x10]\n" "fmla v22.4s, v20.4s, v5.4s\n" "ldr s6, [%[wbptr], #28]\n" "str s18, [x24, x26]\n" "fmla v21.4s, v20.4s, v8.4s\n" "fmla v17.4s, v23.4s, v4.4s\n" "ldr s18, [x9, %[input_col_stride1]]\n" "fmla v22.4s, v23.4s, v7.4s\n" "add %[outptr0], %[outptr0], #4\n" "fmla v21.4s, v23.4s, v10.4s\n" "ldr s8, [%[wbptr], #20]\n" "str s17, [x23, x27]\n" "fmla v19.4s, v24.4s, v4.4s\n" "fmla v22.4s, v16.4s, v4.4s\n" "add x23, x23, #4\n" "fmla v21.4s, v24.4s, v5.4s\n" "ldr s10, [%[wbptr], #12]\n" "str s19, [x25, x26]\n" "mov v17.16b, v14.16b\n" "str s22, [x24, x27]\n" "mov v23.16b, v14.16b\n" "fmla v21.4s, v16.4s, v7.4s\n" "ldr s5, [%[wbptr], #32]\n" "mov v24.16b, v14.16b\n" "add x24, x24, #4\n" "mov v20.16b, v14.16b\n" "mov v16.16b, v14.16b\n" "fmla v21.4s, v13.4s, v4.4s\n" "ldr s7, [%[wbptr], #24]\n" "mov v13.16b, v14.16b\n" "mov v0.16b, v14.16b\n" "mov v1.16b, v14.16b\n" "mov v2.16b, v14.16b\n" "str s21, [x25, x27]\n" "mov v3.16b, v14.16b\n" "ldr s4, [%[wbptr], #36]\n" "add x25, x25, #4\n" "fmla v17.4s, v29.4s, v12.4s\n" "bne 5b\n" "6:\n" "fmla v17.4s, v28.4s, v9.4s\n" "prfm pldl1keep, [x8, x16]\n" "fmla v23.4s, v28.4s, v12.4s\n" "ldr s22, [x8, x15]\n" "fmla v24.4s, v30.4s, v12.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v17.4s, v30.4s, v11.4s\n" "ldr s29, [%[inptr0], x17]\n" "fmla v23.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x11, #64]\n" "fmla v20.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x10, x28]\n" "fmla v17.4s, v25.4s, v6.4s\n" "ldr s25, [x11]\n" "fmla v23.4s, v26.4s, v11.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v24.4s, v26.4s, v9.4s\n" "prfm pldl1keep, [x8, x18]\n" "fmla v17.4s, v26.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x20]\n" "fmla v16.4s, v26.4s, v12.4s\n" "ldr s28, [x10, %[input_col_stride1]]\n" "fmla v24.4s, v27.4s, v11.4s\n" "prfm pldl1keep, [x12, #64]\n" "fmla v17.4s, v27.4s, v10.4s\n" "prfm pldl1keep, [x11, x28]\n" "fmla v13.4s, v27.4s, v12.4s\n" "ldr s19, [x9, x15]\n" "fmla v23.4s, v15.4s, v6.4s\n" "prfm pldl1keep, [x10, x16]\n" "fmla v20.4s, v15.4s, v9.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v0.4s, v15.4s, v12.4s\n" "ldr s21, [x8, x17]\n" "fmla v17.4s, v18.4s, v5.4s\n" "prfm pldl1keep, [x8, x20]\n" "fmla v23.4s, v18.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x22]\n" "fmla v24.4s, v18.4s, v6.4s\n" "prfm pldl1keep, [x12, x28]\n" "fmla v20.4s, v18.4s, v11.4s\n" "prfm pldl1keep, [x11, x16]\n" "fmla v16.4s, v18.4s, v9.4s\n" "prfm pldl1keep, [x10, x18]\n" "fmla v1.4s, v18.4s, v12.4s\n" "ldr s27, [%[inptr0], x19]\n" "fmla v17.4s, v22.4s, v7.4s\n" "prfm pldl1keep, [x9, x20]\n" "fmla v23.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x8, x22]\n" "fmla v24.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x12, x16]\n" "fmla v16.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x11, x18]\n" "fmla v13.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x20]\n" "fmla v2.4s, v22.4s, v12.4s\n" "ldr s18, [x12]\n" "fmla v24.4s, v29.4s, v10.4s\n" "prfm pldl1keep, [x9, x22]\n" "fmla v13.4s, v29.4s, v11.4s\n" "prfm pldl1keep, [x12, x18]\n" "fmla v3.4s, v29.4s, v12.4s\n" "ldr s22, [x11, %[input_col_stride1]]\n" "fmla v20.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x11, x20]\n" "fmla v0.4s, v25.4s, v9.4s\n" "ldr s25, [x10, x15]\n" "fmla v23.4s, v28.4s, v5.4s\n" "prfm pldl1keep, [x10, x22]\n" "fmla v20.4s, v28.4s, v8.4s\n" "prfm pldl1keep, [x12, x20]\n" "fmla v16.4s, v28.4s, v6.4s\n" "prfm pldl1keep, [x11, x22]\n" "fmla v0.4s, v28.4s, v11.4s\n" "prfm pldl1keep, [x12, x22]\n" "fmla v1.4s, v28.4s, v9.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v17.4s, v19.4s, v4.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v23.4s, v19.4s, v7.4s\n" "fmla v24.4s, v19.4s, v5.4s\n" "fmla v20.4s, v19.4s, v10.4s\n" "fmla v16.4s, v19.4s, v8.4s\n" "str s17, [%[outptr0]]\n" "mov v15.16b, v14.16b\n" "fmla v13.4s, v19.4s, v6.4s\n" "fmla v1.4s, v19.4s, v11.4s\n" "fmla v15.4s, v28.4s, v12.4s\n" "ldr s29, [x9, x17]\n" "fmla v2.4s, v19.4s, v9.4s\n" "fmla v24.4s, v21.4s, v7.4s\n" "fmla v16.4s, v21.4s, v10.4s\n" "fmla v13.4s, v21.4s, v8.4s\n" "fmla v3.4s, v21.4s, v9.4s\n" "fmla v0.4s, v18.4s, v6.4s\n" "mov v18.16b, v14.16b\n" "fmla v2.4s, v21.4s, v11.4s\n" "fmla v13.4s, v27.4s, v10.4s\n" "fmla v20.4s, v22.4s, v5.4s\n" "fmla v18.4s, v19.4s, v12.4s\n" "ldr s26, [x8, x19]\n" "fmla v3.4s, v27.4s, v11.4s\n" "ldr s28, [%[inptr0], x21]\n" "fmla v0.4s, v22.4s, v8.4s\n" "add %[inptr0], %[inptr0], #4\n" "fmla v1.4s, v22.4s, v6.4s\n" "fmla v15.4s, v22.4s, v9.4s\n" "mov v17.16b, v14.16b\n" "fmla v23.4s, v25.4s, v4.4s\n" "fmla v20.4s, v25.4s, v7.4s\n" "fmla v16.4s, v25.4s, v5.4s\n" "fmla v17.4s, v21.4s, v12.4s\n" "ldr s30, [x12, %[input_col_stride1]]\n" "str s23, [x23]\n" "mov v19.16b, v14.16b\n" "fmla v0.4s, v25.4s, v10.4s\n" "fmla v1.4s, v25.4s, v8.4s\n" "fmla v2.4s, v25.4s, v6.4s\n" "fmla v15.4s, v25.4s, v11.4s\n" "fmla v18.4s, v25.4s, v9.4s\n" "fmla v19.4s, v25.4s, v12.4s\n" "mov v22.16b, v14.16b\n" "mov v21.16b, v14.16b\n" "fmla v24.4s, v29.4s, v4.4s\n" "fmla v16.4s, v29.4s, v7.4s\n" "fmla v13.4s, v29.4s, v5.4s\n" "fmla v1.4s, v29.4s, v10.4s\n" "fmla v2.4s, v29.4s, v8.4s\n" "fmla v3.4s, v29.4s, v6.4s\n" "str s24, [%[outptr0], %[output_col_stride1]]\n" "fmla v18.4s, v29.4s, v11.4s\n" "fmla v17.4s, v29.4s, v9.4s\n" "ldr s27, [x11, x15]\n" "fmla v22.4s, v29.4s, v12.4s\n" "ldr s23, [x10, x17]\n" "fmla v13.4s, v26.4s, v7.4s\n" "fmla v2.4s, v26.4s, v10.4s\n" "fmla v3.4s, v26.4s, v8.4s\n" "fmla v17.4s, v26.4s, v11.4s\n" "fmla v0.4s, v30.4s, v5.4s\n" "ldr s24, [x9, x19]\n" "fmla v15.4s, v30.4s, v6.4s\n" "ldr s29, [x8, x21]\n" "fmla v3.4s, v28.4s, v10.4s\n" "ldr s14, [x12, x15]\n" "fmla v20.4s, v27.4s, v4.4s\n" "add x8, x8, #4\n" "fmla v0.4s, v27.4s, v7.4s\n" "fmla v1.4s, v27.4s, v5.4s\n" "fmla v15.4s, v27.4s, v8.4s\n" "fmla v18.4s, v27.4s, v6.4s\n" "str s20, [x24]\n" "fmla v19.4s, v27.4s, v9.4s\n" "fmla v16.4s, v23.4s, v4.4s\n" "ldr s25, [x11, x17]\n" "fmla v1.4s, v23.4s, v7.4s\n" "ldr s30, [x10, x19]\n" "fmla v2.4s, v23.4s, v5.4s\n" "fmla v15.4s, v23.4s, v10.4s\n" "str s16, [x23, %[output_col_stride1]]\n" "fmla v18.4s, v23.4s, v8.4s\n" "fmla v17.4s, v23.4s, v6.4s\n" "ldr s26, [x9, x21]\n" "fmla v19.4s, v23.4s, v11.4s\n" "add x9, x9, #4\n" "fmla v22.4s, v23.4s, v9.4s\n" "fmla v21.4s, v23.4s, v12.4s\n" "fmla v13.4s, v24.4s, v4.4s\n" "ldr s27, [x12, x17]\n" "fmla v2.4s, v24.4s, v7.4s\n" "ldr s20, [x11, x19]\n" "fmla v3.4s, v24.4s, v5.4s\n" "fmla v18.4s, v24.4s, v10.4s\n" "str s13, [%[outptr0], x26]\n" "fmla v17.4s, v24.4s, v8.4s\n" "fmla v22.4s, v24.4s, v11.4s\n" "ldr s23, [x10, x21]\n" "fmla v3.4s, v29.4s, v7.4s\n" "ldr s24, [x12, x19]\n" "fmla v17.4s, v29.4s, v10.4s\n" "ldr s16, [x11, x21]\n" "fmla v0.4s, v14.4s, v4.4s\n" "add x10, x10, #4\n" "fmla v15.4s, v14.4s, v5.4s\n" "add x11, x11, #4\n" "fmla v19.4s, v14.4s, v6.4s\n" "ldr s13, [x12, x21]\n" "str s0, [x25]\n" "fmla v1.4s, v25.4s, v4.4s\n" "fmla v15.4s, v25.4s, v7.4s\n" "add x12, x12, #4\n" "fmla v18.4s, v25.4s, v5.4s\n" "fmla v19.4s, v25.4s, v8.4s\n" "str s1, [x24, %[output_col_stride1]]\n" "fmla v22.4s, v25.4s, v6.4s\n" "fmla v21.4s, v25.4s, v9.4s\n" "fmla v2.4s, v30.4s, v4.4s\n" "fmla v18.4s, v30.4s, v7.4s\n" "fmla v17.4s, v30.4s, v5.4s\n" "fmla v19.4s, v30.4s, v10.4s\n" "fmla v22.4s, v30.4s, v8.4s\n" "str s2, [x23, x26]\n" "fmla v21.4s, v30.4s, v11.4s\n" "fmla v3.4s, v26.4s, v4.4s\n" "fmla v17.4s, v26.4s, v7.4s\n" "fmla v22.4s, v26.4s, v10.4s\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v19.4s, v27.4s, v5.4s\n" "fmla v21.4s, v27.4s, v6.4s\n" "str s3, [%[outptr0], x27]\n" "fmla v18.4s, v20.4s, v4.4s\n" "str s15, [x25, %[output_col_stride1]]\n" "fmla v22.4s, v20.4s, v5.4s\n" "fmla v19.4s, v20.4s, v7.4s\n" "add %[outptr0], %[outptr0], #4\n" "str s18, [x24, x26]\n" "fmla v21.4s, v20.4s, v8.4s\n" "fmla v17.4s, v23.4s, v4.4s\n" "fmla v22.4s, v23.4s, v7.4s\n" "fmla v19.4s, v24.4s, v4.4s\n" "fmla v21.4s, v23.4s, v10.4s\n" "str s17, [x23, x27]\n" "fmla v22.4s, v16.4s, v4.4s\n" "str s19, [x25, x26]\n" "add x23, x23, #4\n" "fmla v21.4s, v24.4s, v5.4s\n" "str s22, [x24, x27]\n" "add x24, x24, #4\n" "fmla v21.4s, v16.4s, v7.4s\n" "fmla v21.4s, v13.4s, v4.4s\n" "str s21, [x25, x27]\n" "add x25, x25, #4\n" "7:\n" : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels) : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" ); } template <> template <> void Conv::execute_tile( int n_channels, const void *weight_bias_ptr, const float *input, const unsigned int input_row_stride, const unsigned int input_col_stride, float *output, const unsigned int output_row_stride, const unsigned int output_col_stride ) { __asm __volatile( "add x9, %[inptr0], %[input_row_stride]\n" "add x28, %[input_col_stride1], %[input_col_stride1]\n" "add x16, %[outptr0], %[output_row_stride]\n" "add x24, x9, %[input_row_stride]\n" "add x25, x28, #64\n" "add x23, x28, %[input_col_stride1]\n" "add x26, x24, %[input_row_stride]\n" "add x11, x23, #64\n" "add x12, x23, %[input_col_stride1]\n" "add x10, x26, %[input_row_stride]\n" "add x13, x12, #64\n" "add x14, x12, %[input_col_stride1]\n" "add x27, x10, %[input_row_stride]\n" "add x15, x14, #64\n" "add x17, x16, %[output_row_stride]\n" "add x18, x17, %[output_row_stride]\n" "add x19, %[output_col_stride1], %[output_col_stride1]\n" "and x21, %[n_channels], #3\n" "add x20, x19, %[output_col_stride1]\n" "lsr x22, %[n_channels], #2\n" "cbz x22, 4f\n" "1:\n" "ldr q21, [%[wbptr]]\n" "subs x22, x22, #1\n" "mov v7.16b, v21.16b\n" "ldr q20, [%[wbptr], #16]\n" "mov v3.16b, v21.16b\n" "ldr q14, [%[wbptr], #32]\n" "mov v6.16b, v21.16b\n" "ldr q13, [%[wbptr], #48]\n" "mov v15.16b, v21.16b\n" "ldr q17, [%[wbptr], #64]\n" "mov v2.16b, v21.16b\n" "ldr q12, [%[wbptr], #80]\n" "mov v5.16b, v21.16b\n" "ldr q11, [%[wbptr], #96]\n" "mov v0.16b, v21.16b\n" "ldr q10, [%[wbptr], #112]\n" "mov v16.16b, v21.16b\n" "ldr q9, [%[wbptr], #128]\n" "mov v1.16b, v21.16b\n" "ldr q8, [%[wbptr], #144]\n" "mov v4.16b, v21.16b\n" "ldr q22, [%[inptr0]]\n" "fmla v7.4s, v22.4s, v20.4s\n" "ldr q19, [x9]\n" "fmla v3.4s, v19.4s, v20.4s\n" "ldr q23, [%[inptr0], %[input_col_stride1]]\n" "fmla v6.4s, v23.4s, v20.4s\n" "ldr q18, [x24]\n" "fmla v7.4s, v19.4s, v17.4s\n" "ldr q27, [x9, %[input_col_stride1]]\n" "fmla v3.4s, v18.4s, v17.4s\n" "ldr q28, [%[inptr0], x28]\n" "fmla v15.4s, v18.4s, v20.4s\n" "ldr q25, [x26]\n" "fmla v7.4s, v23.4s, v14.4s\n" "ldr q22, [x24, %[input_col_stride1]]\n" "fmla v3.4s, v27.4s, v14.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "prfm pldl1keep, [x9, #64]\n" "prfm pldl1keep, [%[inptr0], x8]\n" "fmla v7.4s, v18.4s, v10.4s\n" "prfm pldl1keep, [x24, #64]\n" "prfm pldl1keep, [x9, x8]\n" "prfm pldl1keep, [%[inptr0], x25]\n" "prfm pldl1keep, [x26, #64]\n" "prfm pldl1keep, [x24, x8]\n" "fmla v7.4s, v27.4s, v12.4s\n" "beq 3f\n" "2:\n" "mov v18.16b, v21.16b\n" "ldr q23, [x9, x28]\n" "mov v19.16b, v21.16b\n" "prfm pldl1keep, [x9, x25]\n" "fmla v6.4s, v27.4s, v17.4s\n" "prfm pldl1keep, [%[inptr0], x11]\n" "fmla v2.4s, v27.4s, v20.4s\n" "ldr q24, [%[inptr0], x23]\n" "fmla v7.4s, v28.4s, v13.4s\n" "prfm pldl1keep, [x10, #64]\n" "fmla v6.4s, v28.4s, v14.4s\n" "prfm pldl1keep, [x26, x8]\n" "fmla v5.4s, v28.4s, v20.4s\n" "ldr q26, [x10]\n" "fmla v3.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x25]\n" "fmla v15.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x9, x11]\n" "fmla v0.4s, v25.4s, v20.4s\n" "ldr q25, [x26, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [%[inptr0], x13]\n" "fmla v3.4s, v22.4s, v12.4s\n" "prfm pldl1keep, [x27, #64]\n" "fmla v6.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x10, x8]\n" "fmla v15.4s, v22.4s, v14.4s\n" "prfm pldl1keep, [x26, x25]\n" "fmla v2.4s, v22.4s, v17.4s\n" "prfm pldl1keep, [x24, x11]\n" "fmla v16.4s, v22.4s, v20.4s\n" "ldr q22, [x24, x28]\n" "fmla v7.4s, v23.4s, v11.4s\n" "prfm pldl1keep, [x9, x13]\n" "fmla v3.4s, v23.4s, v13.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v6.4s, v23.4s, v12.4s\n" "prfm pldl1keep, [x27, x8]\n" "fmla v2.4s, v23.4s, v14.4s\n" "prfm pldl1keep, [x10, x25]\n" "fmla v5.4s, v23.4s, v17.4s\n" "prfm pldl1keep, [x26, x11]\n" "fmla v1.4s, v23.4s, v20.4s\n" "ldr q23, [x9, x23]\n" "fmla v6.4s, v24.4s, v13.4s\n" "prfm pldl1keep, [x24, x13]\n" "fmla v5.4s, v24.4s, v14.4s\n" "prfm pldl1keep, [x9, x15]\n" "fmla v4.4s, v24.4s, v20.4s\n" "ldr q24, [%[inptr0], x12]\n" "fmla v15.4s, v26.4s, v10.4s\n" "prfm pldl1keep, [x27, x25]\n" "fmla v0.4s, v26.4s, v17.4s\n" "ldr q29, [x27]\n" "fmla v3.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x10, x11]\n" "fmla v15.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x26, x13]\n" "fmla v2.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x15]\n" "fmla v0.4s, v25.4s, v14.4s\n" "prfm pldl1keep, [x27, x11]\n" "fmla v16.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x10, x13]\n" "fmla v18.4s, v25.4s, v20.4s\n" "ldr q26, [x10, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x26, x15]\n" "fmla v3.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x27, x13]\n" "fmla v6.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x15]\n" "fmla v15.4s, v22.4s, v13.4s\n" "prfm pldl1keep, [x27, x15]\n" "fmla v2.4s, v22.4s, v12.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v5.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v16.4s, v22.4s, v14.4s\n" "subs x22, x22, #1\n" "fmla v1.4s, v22.4s, v17.4s\n" "fmla v19.4s, v22.4s, v20.4s\n" "mov v22.16b, v21.16b\n" "fmla v6.4s, v23.4s, v11.4s\n" "fmla v2.4s, v23.4s, v13.4s\n" "fmla v5.4s, v23.4s, v12.4s\n" "fmla v1.4s, v23.4s, v14.4s\n" "fmla v4.4s, v23.4s, v17.4s\n" "fmla v22.4s, v23.4s, v20.4s\n" "ldr q27, [x26, x28]\n" "fmla v5.4s, v24.4s, v13.4s\n" "fmla v0.4s, v29.4s, v10.4s\n" "mov v23.16b, v21.16b\n" "fmla v4.4s, v24.4s, v14.4s\n" "mov v25.16b, v21.16b\n" "mov v24.16b, v21.16b\n" "fmla v15.4s, v26.4s, v9.4s\n" "fmla v0.4s, v26.4s, v12.4s\n" "fmla v16.4s, v26.4s, v10.4s\n" "fmla v18.4s, v26.4s, v17.4s\n" "fmla v3.4s, v27.4s, v8.4s\n" "ldr q29, [x24, x23]\n" "fmla v15.4s, v27.4s, v11.4s\n" "fmla v2.4s, v27.4s, v9.4s\n" "fmla v0.4s, v27.4s, v13.4s\n" "fmla v16.4s, v27.4s, v12.4s\n" "fmla v1.4s, v27.4s, v10.4s\n" "fmla v18.4s, v27.4s, v14.4s\n" "fmla v19.4s, v27.4s, v17.4s\n" "fmla v23.4s, v27.4s, v20.4s\n" "fmla v6.4s, v29.4s, v8.4s\n" "ldr q28, [x9, x12]\n" "fmla v2.4s, v29.4s, v11.4s\n" "fmla v5.4s, v29.4s, v9.4s\n" "fmla v16.4s, v29.4s, v13.4s\n" "fmla v1.4s, v29.4s, v12.4s\n" "fmla v4.4s, v29.4s, v10.4s\n" "fmla v19.4s, v29.4s, v14.4s\n" "fmla v22.4s, v29.4s, v17.4s\n" "fmla v25.4s, v29.4s, v20.4s\n" "fmla v5.4s, v28.4s, v11.4s\n" "ldr q21, [%[inptr0], x14]\n" "fmla v1.4s, v28.4s, v13.4s\n" "add %[inptr0], %[inptr0], #16\n" "fmla v4.4s, v28.4s, v12.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "fmla v22.4s, v28.4s, v14.4s\n" "ldr q26, [x27, %[input_col_stride1]]\n" "fmla v0.4s, v26.4s, v9.4s\n" "prfm pldl1keep, [%[inptr0], x8]\n" "fmla v4.4s, v21.4s, v13.4s\n" "ldr q21, [x10, x28]\n" "fmla v18.4s, v26.4s, v10.4s\n" "ldr q29, [x26, x23]\n" "fmla v15.4s, v21.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x25]\n" "fmla v0.4s, v21.4s, v11.4s\n" "fmla v16.4s, v21.4s, v9.4s\n" "fmla v18.4s, v21.4s, v12.4s\n" "fmla v19.4s, v21.4s, v10.4s\n" "fmla v23.4s, v21.4s, v17.4s\n" "ldr q21, [x24, x12]\n" "fmla v2.4s, v29.4s, v8.4s\n" "fmla v16.4s, v29.4s, v11.4s\n" "fmla v1.4s, v29.4s, v9.4s\n" "fmla v18.4s, v29.4s, v13.4s\n" "fmla v19.4s, v29.4s, v12.4s\n" "fmla v22.4s, v29.4s, v10.4s\n" "fmla v23.4s, v29.4s, v14.4s\n" "fmla v25.4s, v29.4s, v17.4s\n" "fmla v24.4s, v29.4s, v20.4s\n" "ldr q28, [x9, x14]\n" "fmla v5.4s, v21.4s, v8.4s\n" "ldr q27, [x27, x28]\n" "fmla v1.4s, v21.4s, v11.4s\n" "add x9, x9, #16\n" "fmla v4.4s, v21.4s, v9.4s\n" "prfm pldl1keep, [x9, #64]\n" "fmla v19.4s, v21.4s, v13.4s\n" "prfm pldl1keep, [x9, x8]\n" "fmla v22.4s, v21.4s, v12.4s\n" "fmla v25.4s, v21.4s, v14.4s\n" "fmla v4.4s, v28.4s, v11.4s\n" "ldr q20, [x10, x23]\n" "fmla v0.4s, v27.4s, v8.4s\n" "fmla v18.4s, v27.4s, v9.4s\n" "fmla v22.4s, v28.4s, v13.4s\n" "ldr q26, [x26, x12]\n" "fmla v23.4s, v27.4s, v10.4s\n" "ldr q21, [x24, x14]\n" "fmla v16.4s, v20.4s, v8.4s\n" "add x24, x24, #16\n" "fmla v18.4s, v20.4s, v11.4s\n" "prfm pldl1keep, [x24, #64]\n" "fmla v19.4s, v20.4s, v9.4s\n" "prfm pldl1keep, [x24, x8]\n" "fmla v23.4s, v20.4s, v12.4s\n" "fmla v25.4s, v20.4s, v10.4s\n" "fmla v24.4s, v20.4s, v17.4s\n" "ldr q28, [x27, x23]\n" "fmla v1.4s, v26.4s, v8.4s\n" "ldr q20, [x10, x12]\n" "fmla v19.4s, v26.4s, v11.4s\n" "fmla v22.4s, v26.4s, v9.4s\n" "fmla v23.4s, v26.4s, v13.4s\n" "fmla v25.4s, v26.4s, v12.4s\n" "fmla v24.4s, v26.4s, v14.4s\n" "ldr q17, [x26, x14]\n" "fmla v4.4s, v21.4s, v8.4s\n" "ldr q26, [x27, x12]\n" "fmla v22.4s, v21.4s, v11.4s\n" "add x26, x26, #16\n" "fmla v25.4s, v21.4s, v13.4s\n" "ldr q27, [x10, x14]\n" "fmla v18.4s, v28.4s, v8.4s\n" "prfm pldl1keep, [x26, #64]\n" "fmla v23.4s, v28.4s, v9.4s\n" "add x10, x10, #16\n" "fmla v24.4s, v28.4s, v10.4s\n" "ldr q28, [x27, x14]\n" "fmla v19.4s, v20.4s, v8.4s\n" "ldr q21, [%[wbptr]]\n" "fmla v23.4s, v20.4s, v11.4s\n" "add x27, x27, #16\n" "fmla v25.4s, v20.4s, v9.4s\n" "fmla v24.4s, v20.4s, v12.4s\n" "fmla v22.4s, v17.4s, v8.4s\n" "ldr q20, [%[wbptr], #16]\n" "fmla v23.4s, v26.4s, v8.4s\n" "ldr q14, [%[wbptr], #32]\n" "fmla v24.4s, v17.4s, v13.4s\n" "movi v29.16b, #0\n" "fmla v25.4s, v17.4s, v11.4s\n" "ldr q17, [%[wbptr], #64]\n" "fmax v7.4s, v7.4s, v29.4s\n" "fmax v6.4s, v6.4s, v29.4s\n" "fmla v24.4s, v26.4s, v9.4s\n" "ldr q13, [%[wbptr], #48]\n" "str q7, [%[outptr0]]\n" "fmla v25.4s, v27.4s, v8.4s\n" "str q6, [%[outptr0], %[output_col_stride1]]\n" "fmax v5.4s, v5.4s, v29.4s\n" "fmla v24.4s, v27.4s, v11.4s\n" "ldr q12, [%[wbptr], #80]\n" "str q5, [%[outptr0], x19]\n" "fmax v4.4s, v4.4s, v29.4s\n" "fmax v3.4s, v3.4s, v29.4s\n" "ldr q10, [%[wbptr], #112]\n" "str q4, [%[outptr0], x20]\n" "fmla v24.4s, v28.4s, v8.4s\n" "str q3, [x16]\n" "fmax v2.4s, v2.4s, v29.4s\n" "fmax v1.4s, v1.4s, v29.4s\n" "ldr q11, [%[wbptr], #96]\n" "str q2, [x16, %[output_col_stride1]]\n" "fmax v22.4s, v22.4s, v29.4s\n" "str q1, [x16, x19]\n" "fmax v15.4s, v15.4s, v29.4s\n" "str q22, [x16, x20]\n" "fmax v16.4s, v16.4s, v29.4s\n" "str q15, [x17]\n" "fmax v19.4s, v19.4s, v29.4s\n" "str q16, [x17, %[output_col_stride1]]\n" "fmax v25.4s, v25.4s, v29.4s\n" "str q19, [x17, x19]\n" "fmax v0.4s, v0.4s, v29.4s\n" "str q25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" "str q0, [x18]\n" "fmax v23.4s, v23.4s, v29.4s\n" "str q18, [x18, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" "str q23, [x18, x19]\n" "mov v7.16b, v21.16b\n" "str q24, [x18, x20]\n" "mov v3.16b, v21.16b\n" "mov v6.16b, v21.16b\n" "ldr q9, [%[wbptr], #128]\n" "mov v15.16b, v21.16b\n" "ldr q8, [%[wbptr], #144]\n" "mov v2.16b, v21.16b\n" "ldr q22, [%[inptr0]]\n" "mov v5.16b, v21.16b\n" "ldr q19, [x9]\n" "mov v0.16b, v21.16b\n" "ldr q23, [%[inptr0], %[input_col_stride1]]\n" "mov v16.16b, v21.16b\n" "ldr q18, [x24]\n" "mov v1.16b, v21.16b\n" "ldr q27, [x9, %[input_col_stride1]]\n" "mov v4.16b, v21.16b\n" "ldr q28, [%[inptr0], x28]\n" "fmla v7.4s, v22.4s, v20.4s\n" "ldr q25, [x26]\n" "fmla v3.4s, v19.4s, v20.4s\n" "ldr q22, [x24, %[input_col_stride1]]\n" "fmla v6.4s, v23.4s, v20.4s\n" "add %[outptr0], %[outptr0], #16\n" "fmla v7.4s, v19.4s, v17.4s\n" "add x16, x16, #16\n" "fmla v3.4s, v18.4s, v17.4s\n" "add x17, x17, #16\n" "fmla v15.4s, v18.4s, v20.4s\n" "add x18, x18, #16\n" "fmla v7.4s, v23.4s, v14.4s\n" "fmla v3.4s, v27.4s, v14.4s\n" "fmla v7.4s, v18.4s, v10.4s\n" "fmla v7.4s, v27.4s, v12.4s\n" "bne 2b\n" "3:\n" "mov v18.16b, v21.16b\n" "ldr q23, [x9, x28]\n" "mov v19.16b, v21.16b\n" "prfm pldl1keep, [x9, x25]\n" "fmla v6.4s, v27.4s, v17.4s\n" "prfm pldl1keep, [%[inptr0], x11]\n" "fmla v2.4s, v27.4s, v20.4s\n" "ldr q24, [%[inptr0], x23]\n" "fmla v7.4s, v28.4s, v13.4s\n" "prfm pldl1keep, [x10, #64]\n" "fmla v6.4s, v28.4s, v14.4s\n" "prfm pldl1keep, [x26, x8]\n" "fmla v5.4s, v28.4s, v20.4s\n" "ldr q26, [x10]\n" "fmla v3.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x25]\n" "fmla v15.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x9, x11]\n" "fmla v0.4s, v25.4s, v20.4s\n" "ldr q25, [x26, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [%[inptr0], x13]\n" "fmla v3.4s, v22.4s, v12.4s\n" "prfm pldl1keep, [x27, #64]\n" "fmla v6.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x10, x8]\n" "fmla v15.4s, v22.4s, v14.4s\n" "prfm pldl1keep, [x26, x25]\n" "fmla v2.4s, v22.4s, v17.4s\n" "prfm pldl1keep, [x24, x11]\n" "fmla v16.4s, v22.4s, v20.4s\n" "ldr q22, [x24, x28]\n" "fmla v7.4s, v23.4s, v11.4s\n" "prfm pldl1keep, [x9, x13]\n" "fmla v3.4s, v23.4s, v13.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v6.4s, v23.4s, v12.4s\n" "prfm pldl1keep, [x27, x8]\n" "fmla v2.4s, v23.4s, v14.4s\n" "prfm pldl1keep, [x10, x25]\n" "fmla v5.4s, v23.4s, v17.4s\n" "prfm pldl1keep, [x26, x11]\n" "fmla v1.4s, v23.4s, v20.4s\n" "ldr q23, [x9, x23]\n" "fmla v6.4s, v24.4s, v13.4s\n" "prfm pldl1keep, [x24, x13]\n" "fmla v5.4s, v24.4s, v14.4s\n" "prfm pldl1keep, [x9, x15]\n" "fmla v4.4s, v24.4s, v20.4s\n" "ldr q24, [%[inptr0], x12]\n" "fmla v15.4s, v26.4s, v10.4s\n" "prfm pldl1keep, [x27, x25]\n" "fmla v0.4s, v26.4s, v17.4s\n" "ldr q29, [x27]\n" "fmla v3.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x10, x11]\n" "fmla v15.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x26, x13]\n" "fmla v2.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x15]\n" "fmla v0.4s, v25.4s, v14.4s\n" "prfm pldl1keep, [x27, x11]\n" "fmla v16.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x10, x13]\n" "fmla v18.4s, v25.4s, v20.4s\n" "ldr q26, [x10, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x26, x15]\n" "fmla v3.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x27, x13]\n" "fmla v6.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x15]\n" "fmla v15.4s, v22.4s, v13.4s\n" "prfm pldl1keep, [x27, x15]\n" "fmla v2.4s, v22.4s, v12.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v5.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v16.4s, v22.4s, v14.4s\n" "fmla v1.4s, v22.4s, v17.4s\n" "fmla v19.4s, v22.4s, v20.4s\n" "ldr q27, [x26, x28]\n" "fmla v6.4s, v23.4s, v11.4s\n" "fmla v2.4s, v23.4s, v13.4s\n" "fmla v5.4s, v23.4s, v12.4s\n" "fmla v1.4s, v23.4s, v14.4s\n" "fmla v4.4s, v23.4s, v17.4s\n" "fmla v0.4s, v29.4s, v10.4s\n" "mov v22.16b, v21.16b\n" "fmla v15.4s, v26.4s, v9.4s\n" "fmla v5.4s, v24.4s, v13.4s\n" "fmla v16.4s, v26.4s, v10.4s\n" "fmla v22.4s, v23.4s, v20.4s\n" "ldr q29, [x24, x23]\n" "fmla v4.4s, v24.4s, v14.4s\n" "ldr q28, [x9, x12]\n" "fmla v0.4s, v26.4s, v12.4s\n" "fmla v18.4s, v26.4s, v17.4s\n" "mov v23.16b, v21.16b\n" "fmla v3.4s, v27.4s, v8.4s\n" "fmla v15.4s, v27.4s, v11.4s\n" "fmla v2.4s, v27.4s, v9.4s\n" "fmla v0.4s, v27.4s, v13.4s\n" "fmla v16.4s, v27.4s, v12.4s\n" "fmla v1.4s, v27.4s, v10.4s\n" "fmla v18.4s, v27.4s, v14.4s\n" "fmla v19.4s, v27.4s, v17.4s\n" "fmla v23.4s, v27.4s, v20.4s\n" "mov v25.16b, v21.16b\n" "mov v24.16b, v21.16b\n" "fmla v6.4s, v29.4s, v8.4s\n" "fmla v2.4s, v29.4s, v11.4s\n" "fmla v5.4s, v29.4s, v9.4s\n" "fmla v16.4s, v29.4s, v13.4s\n" "fmla v1.4s, v29.4s, v12.4s\n" "fmla v4.4s, v29.4s, v10.4s\n" "fmla v19.4s, v29.4s, v14.4s\n" "fmla v22.4s, v29.4s, v17.4s\n" "fmla v25.4s, v29.4s, v20.4s\n" "ldr q21, [%[inptr0], x14]\n" "fmla v5.4s, v28.4s, v11.4s\n" "add %[inptr0], %[inptr0], #16\n" "fmla v1.4s, v28.4s, v13.4s\n" "fmla v4.4s, v28.4s, v12.4s\n" "fmla v22.4s, v28.4s, v14.4s\n" "ldr q26, [x27, %[input_col_stride1]]\n" "fmla v0.4s, v26.4s, v9.4s\n" "fmla v18.4s, v26.4s, v10.4s\n" "fmla v4.4s, v21.4s, v13.4s\n" "ldr q21, [x10, x28]\n" "fmla v15.4s, v21.4s, v8.4s\n" "ldr q29, [x26, x23]\n" "fmla v0.4s, v21.4s, v11.4s\n" "fmla v16.4s, v21.4s, v9.4s\n" "fmla v18.4s, v21.4s, v12.4s\n" "fmla v19.4s, v21.4s, v10.4s\n" "fmla v23.4s, v21.4s, v17.4s\n" "ldr q21, [x24, x12]\n" "fmla v2.4s, v29.4s, v8.4s\n" "fmla v16.4s, v29.4s, v11.4s\n" "fmla v1.4s, v29.4s, v9.4s\n" "fmla v18.4s, v29.4s, v13.4s\n" "fmla v19.4s, v29.4s, v12.4s\n" "fmla v22.4s, v29.4s, v10.4s\n" "fmla v23.4s, v29.4s, v14.4s\n" "fmla v25.4s, v29.4s, v17.4s\n" "fmla v24.4s, v29.4s, v20.4s\n" "ldr q28, [x9, x14]\n" "fmla v5.4s, v21.4s, v8.4s\n" "ldr q27, [x27, x28]\n" "fmla v1.4s, v21.4s, v11.4s\n" "add x9, x9, #16\n" "fmla v4.4s, v21.4s, v9.4s\n" "fmla v19.4s, v21.4s, v13.4s\n" "fmla v22.4s, v21.4s, v12.4s\n" "fmla v25.4s, v21.4s, v14.4s\n" "fmla v0.4s, v27.4s, v8.4s\n" "ldr q20, [x10, x23]\n" "fmla v4.4s, v28.4s, v11.4s\n" "fmla v18.4s, v27.4s, v9.4s\n" "fmla v22.4s, v28.4s, v13.4s\n" "ldr q26, [x26, x12]\n" "fmla v23.4s, v27.4s, v10.4s\n" "ldr q21, [x24, x14]\n" "fmla v16.4s, v20.4s, v8.4s\n" "add x24, x24, #16\n" "fmla v18.4s, v20.4s, v11.4s\n" "fmla v19.4s, v20.4s, v9.4s\n" "fmla v23.4s, v20.4s, v12.4s\n" "fmla v25.4s, v20.4s, v10.4s\n" "fmla v24.4s, v20.4s, v17.4s\n" "ldr q28, [x27, x23]\n" "fmla v1.4s, v26.4s, v8.4s\n" "ldr q20, [x10, x12]\n" "fmla v19.4s, v26.4s, v11.4s\n" "fmla v22.4s, v26.4s, v9.4s\n" "fmla v23.4s, v26.4s, v13.4s\n" "fmla v25.4s, v26.4s, v12.4s\n" "fmla v24.4s, v26.4s, v14.4s\n" "ldr q17, [x26, x14]\n" "fmla v4.4s, v21.4s, v8.4s\n" "ldr q26, [x27, x12]\n" "fmla v22.4s, v21.4s, v11.4s\n" "add x26, x26, #16\n" "fmla v25.4s, v21.4s, v13.4s\n" "ldr q27, [x10, x14]\n" "fmla v18.4s, v28.4s, v8.4s\n" "add x10, x10, #16\n" "fmla v23.4s, v28.4s, v9.4s\n" "fmla v24.4s, v28.4s, v10.4s\n" "fmla v19.4s, v20.4s, v8.4s\n" "ldr q28, [x27, x14]\n" "fmla v25.4s, v20.4s, v9.4s\n" "add x27, x27, #16\n" "fmla v23.4s, v20.4s, v11.4s\n" "fmla v24.4s, v20.4s, v12.4s\n" "fmla v22.4s, v17.4s, v8.4s\n" "movi v29.16b, #0\n" "fmla v25.4s, v17.4s, v11.4s\n" "fmla v24.4s, v17.4s, v13.4s\n" "fmla v23.4s, v26.4s, v8.4s\n" "fmax v7.4s, v7.4s, v29.4s\n" "fmla v25.4s, v27.4s, v8.4s\n" "fmax v6.4s, v6.4s, v29.4s\n" "str q7, [%[outptr0]]\n" "fmla v24.4s, v26.4s, v9.4s\n" "str q6, [%[outptr0], %[output_col_stride1]]\n" "fmax v5.4s, v5.4s, v29.4s\n" "fmax v4.4s, v4.4s, v29.4s\n" "fmax v3.4s, v3.4s, v29.4s\n" "str q5, [%[outptr0], x19]\n" "fmla v24.4s, v27.4s, v11.4s\n" "str q4, [%[outptr0], x20]\n" "fmax v2.4s, v2.4s, v29.4s\n" "str q3, [x16]\n" "fmax v1.4s, v1.4s, v29.4s\n" "str q2, [x16, %[output_col_stride1]]\n" "fmla v24.4s, v28.4s, v8.4s\n" "str q1, [x16, x19]\n" "fmax v22.4s, v22.4s, v29.4s\n" "fmax v15.4s, v15.4s, v29.4s\n" "add %[outptr0], %[outptr0], #16\n" "str q22, [x16, x20]\n" "fmax v16.4s, v16.4s, v29.4s\n" "str q15, [x17]\n" "fmax v19.4s, v19.4s, v29.4s\n" "str q16, [x17, %[output_col_stride1]]\n" "fmax v25.4s, v25.4s, v29.4s\n" "str q19, [x17, x19]\n" "fmax v0.4s, v0.4s, v29.4s\n" "str q25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" "str q0, [x18]\n" "fmax v23.4s, v23.4s, v29.4s\n" "str q18, [x18, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" "str q23, [x18, x19]\n" "add x16, x16, #16\n" "str q24, [x18, x20]\n" "add x17, x17, #16\n" "add x18, x18, #16\n" "4:\n" "cbz x21, 7f\n" "ldr s21, [%[wbptr]]\n" "mov v7.16b, v21.16b\n" "ldr s20, [%[wbptr], #4]\n" "mov v3.16b, v21.16b\n" "ldr s14, [%[wbptr], #8]\n" "mov v6.16b, v21.16b\n" "ldr s13, [%[wbptr], #12]\n" "mov v15.16b, v21.16b\n" "ldr s17, [%[wbptr], #16]\n" "mov v2.16b, v21.16b\n" "ldr s12, [%[wbptr], #20]\n" "mov v5.16b, v21.16b\n" "ldr s11, [%[wbptr], #24]\n" "mov v0.16b, v21.16b\n" "ldr s10, [%[wbptr], #28]\n" "mov v16.16b, v21.16b\n" "ldr s9, [%[wbptr], #32]\n" "mov v1.16b, v21.16b\n" "ldr s8, [%[wbptr], #36]\n" "mov v4.16b, v21.16b\n" "ldr s22, [%[inptr0]]\n" "fmla v7.4s, v22.4s, v20.4s\n" "ldr s19, [x9]\n" "fmla v3.4s, v19.4s, v20.4s\n" "ldr s23, [%[inptr0], %[input_col_stride1]]\n" "fmla v6.4s, v23.4s, v20.4s\n" "ldr s18, [x24]\n" "fmla v7.4s, v19.4s, v17.4s\n" "ldr s27, [x9, %[input_col_stride1]]\n" "fmla v3.4s, v18.4s, v17.4s\n" "ldr s28, [%[inptr0], x28]\n" "fmla v15.4s, v18.4s, v20.4s\n" "ldr s25, [x26]\n" "fmla v7.4s, v23.4s, v14.4s\n" "ldr s22, [x24, %[input_col_stride1]]\n" "fmla v3.4s, v27.4s, v14.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "prfm pldl1keep, [x9, #64]\n" "subs x21, x21, #1\n" "prfm pldl1keep, [%[inptr0], x8]\n" "prfm pldl1keep, [x24, #64]\n" "fmla v7.4s, v18.4s, v10.4s\n" "prfm pldl1keep, [x9, x8]\n" "prfm pldl1keep, [%[inptr0], x25]\n" "prfm pldl1keep, [x26, #64]\n" "prfm pldl1keep, [x24, x8]\n" "fmla v7.4s, v27.4s, v12.4s\n" "beq 6f\n" "5:\n" "mov v18.16b, v21.16b\n" "ldr s23, [x9, x28]\n" "mov v19.16b, v21.16b\n" "prfm pldl1keep, [x9, x25]\n" "fmla v6.4s, v27.4s, v17.4s\n" "prfm pldl1keep, [%[inptr0], x11]\n" "fmla v2.4s, v27.4s, v20.4s\n" "ldr s24, [%[inptr0], x23]\n" "fmla v7.4s, v28.4s, v13.4s\n" "prfm pldl1keep, [x10, #64]\n" "fmla v6.4s, v28.4s, v14.4s\n" "prfm pldl1keep, [x26, x8]\n" "fmla v5.4s, v28.4s, v20.4s\n" "ldr s26, [x10]\n" "fmla v3.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x25]\n" "fmla v15.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x9, x11]\n" "fmla v0.4s, v25.4s, v20.4s\n" "ldr s25, [x26, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [%[inptr0], x13]\n" "fmla v3.4s, v22.4s, v12.4s\n" "prfm pldl1keep, [x27, #64]\n" "fmla v6.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x10, x8]\n" "fmla v15.4s, v22.4s, v14.4s\n" "prfm pldl1keep, [x26, x25]\n" "fmla v2.4s, v22.4s, v17.4s\n" "prfm pldl1keep, [x24, x11]\n" "fmla v16.4s, v22.4s, v20.4s\n" "ldr s22, [x24, x28]\n" "fmla v7.4s, v23.4s, v11.4s\n" "prfm pldl1keep, [x9, x13]\n" "fmla v3.4s, v23.4s, v13.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v6.4s, v23.4s, v12.4s\n" "prfm pldl1keep, [x27, x8]\n" "fmla v2.4s, v23.4s, v14.4s\n" "prfm pldl1keep, [x10, x25]\n" "fmla v5.4s, v23.4s, v17.4s\n" "prfm pldl1keep, [x26, x11]\n" "fmla v1.4s, v23.4s, v20.4s\n" "ldr s23, [x9, x23]\n" "fmla v6.4s, v24.4s, v13.4s\n" "prfm pldl1keep, [x24, x13]\n" "fmla v5.4s, v24.4s, v14.4s\n" "prfm pldl1keep, [x9, x15]\n" "fmla v4.4s, v24.4s, v20.4s\n" "ldr s24, [%[inptr0], x12]\n" "fmla v15.4s, v26.4s, v10.4s\n" "prfm pldl1keep, [x27, x25]\n" "fmla v0.4s, v26.4s, v17.4s\n" "ldr s29, [x27]\n" "fmla v3.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x10, x11]\n" "fmla v15.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x26, x13]\n" "fmla v2.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x15]\n" "fmla v0.4s, v25.4s, v14.4s\n" "prfm pldl1keep, [x27, x11]\n" "fmla v16.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x10, x13]\n" "fmla v18.4s, v25.4s, v20.4s\n" "ldr s26, [x10, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x26, x15]\n" "fmla v3.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x27, x13]\n" "fmla v6.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x15]\n" "fmla v15.4s, v22.4s, v13.4s\n" "prfm pldl1keep, [x27, x15]\n" "fmla v2.4s, v22.4s, v12.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v5.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v16.4s, v22.4s, v14.4s\n" "subs x21, x21, #1\n" "fmla v1.4s, v22.4s, v17.4s\n" "fmla v19.4s, v22.4s, v20.4s\n" "mov v22.16b, v21.16b\n" "fmla v6.4s, v23.4s, v11.4s\n" "fmla v2.4s, v23.4s, v13.4s\n" "fmla v5.4s, v23.4s, v12.4s\n" "fmla v1.4s, v23.4s, v14.4s\n" "fmla v4.4s, v23.4s, v17.4s\n" "fmla v22.4s, v23.4s, v20.4s\n" "ldr s27, [x26, x28]\n" "fmla v5.4s, v24.4s, v13.4s\n" "fmla v0.4s, v29.4s, v10.4s\n" "mov v23.16b, v21.16b\n" "fmla v4.4s, v24.4s, v14.4s\n" "mov v25.16b, v21.16b\n" "mov v24.16b, v21.16b\n" "fmla v15.4s, v26.4s, v9.4s\n" "fmla v0.4s, v26.4s, v12.4s\n" "fmla v16.4s, v26.4s, v10.4s\n" "fmla v18.4s, v26.4s, v17.4s\n" "fmla v3.4s, v27.4s, v8.4s\n" "ldr s29, [x24, x23]\n" "fmla v15.4s, v27.4s, v11.4s\n" "fmla v2.4s, v27.4s, v9.4s\n" "fmla v0.4s, v27.4s, v13.4s\n" "fmla v16.4s, v27.4s, v12.4s\n" "fmla v1.4s, v27.4s, v10.4s\n" "fmla v18.4s, v27.4s, v14.4s\n" "fmla v19.4s, v27.4s, v17.4s\n" "fmla v23.4s, v27.4s, v20.4s\n" "fmla v6.4s, v29.4s, v8.4s\n" "ldr s28, [x9, x12]\n" "fmla v2.4s, v29.4s, v11.4s\n" "fmla v5.4s, v29.4s, v9.4s\n" "fmla v16.4s, v29.4s, v13.4s\n" "fmla v1.4s, v29.4s, v12.4s\n" "fmla v4.4s, v29.4s, v10.4s\n" "fmla v19.4s, v29.4s, v14.4s\n" "fmla v22.4s, v29.4s, v17.4s\n" "fmla v25.4s, v29.4s, v20.4s\n" "fmla v5.4s, v28.4s, v11.4s\n" "ldr s21, [%[inptr0], x14]\n" "fmla v1.4s, v28.4s, v13.4s\n" "add %[inptr0], %[inptr0], #4\n" "fmla v4.4s, v28.4s, v12.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "fmla v22.4s, v28.4s, v14.4s\n" "ldr s26, [x27, %[input_col_stride1]]\n" "fmla v0.4s, v26.4s, v9.4s\n" "prfm pldl1keep, [%[inptr0], x8]\n" "fmla v4.4s, v21.4s, v13.4s\n" "ldr s21, [x10, x28]\n" "fmla v18.4s, v26.4s, v10.4s\n" "ldr s29, [x26, x23]\n" "fmla v15.4s, v21.4s, v8.4s\n" "prfm pldl1keep, [%[inptr0], x25]\n" "fmla v0.4s, v21.4s, v11.4s\n" "fmla v16.4s, v21.4s, v9.4s\n" "fmla v18.4s, v21.4s, v12.4s\n" "fmla v19.4s, v21.4s, v10.4s\n" "fmla v23.4s, v21.4s, v17.4s\n" "ldr s21, [x24, x12]\n" "fmla v2.4s, v29.4s, v8.4s\n" "fmla v16.4s, v29.4s, v11.4s\n" "fmla v1.4s, v29.4s, v9.4s\n" "fmla v18.4s, v29.4s, v13.4s\n" "fmla v19.4s, v29.4s, v12.4s\n" "fmla v22.4s, v29.4s, v10.4s\n" "fmla v23.4s, v29.4s, v14.4s\n" "fmla v25.4s, v29.4s, v17.4s\n" "fmla v24.4s, v29.4s, v20.4s\n" "ldr s28, [x9, x14]\n" "fmla v5.4s, v21.4s, v8.4s\n" "ldr s27, [x27, x28]\n" "fmla v1.4s, v21.4s, v11.4s\n" "add x9, x9, #4\n" "fmla v4.4s, v21.4s, v9.4s\n" "prfm pldl1keep, [x9, #64]\n" "fmla v19.4s, v21.4s, v13.4s\n" "prfm pldl1keep, [x9, x8]\n" "fmla v22.4s, v21.4s, v12.4s\n" "fmla v25.4s, v21.4s, v14.4s\n" "fmla v4.4s, v28.4s, v11.4s\n" "ldr s20, [x10, x23]\n" "fmla v0.4s, v27.4s, v8.4s\n" "fmla v18.4s, v27.4s, v9.4s\n" "fmla v22.4s, v28.4s, v13.4s\n" "ldr s26, [x26, x12]\n" "fmla v23.4s, v27.4s, v10.4s\n" "ldr s21, [x24, x14]\n" "fmla v16.4s, v20.4s, v8.4s\n" "add x24, x24, #4\n" "fmla v18.4s, v20.4s, v11.4s\n" "prfm pldl1keep, [x24, #64]\n" "fmla v19.4s, v20.4s, v9.4s\n" "prfm pldl1keep, [x24, x8]\n" "fmla v23.4s, v20.4s, v12.4s\n" "fmla v25.4s, v20.4s, v10.4s\n" "fmla v24.4s, v20.4s, v17.4s\n" "ldr s28, [x27, x23]\n" "fmla v1.4s, v26.4s, v8.4s\n" "ldr s20, [x10, x12]\n" "fmla v19.4s, v26.4s, v11.4s\n" "fmla v22.4s, v26.4s, v9.4s\n" "fmla v23.4s, v26.4s, v13.4s\n" "fmla v25.4s, v26.4s, v12.4s\n" "fmla v24.4s, v26.4s, v14.4s\n" "ldr s17, [x26, x14]\n" "fmla v4.4s, v21.4s, v8.4s\n" "ldr s26, [x27, x12]\n" "fmla v22.4s, v21.4s, v11.4s\n" "add x26, x26, #4\n" "fmla v25.4s, v21.4s, v13.4s\n" "ldr s27, [x10, x14]\n" "fmla v18.4s, v28.4s, v8.4s\n" "prfm pldl1keep, [x26, #64]\n" "fmla v23.4s, v28.4s, v9.4s\n" "add x10, x10, #4\n" "fmla v24.4s, v28.4s, v10.4s\n" "ldr s28, [x27, x14]\n" "fmla v19.4s, v20.4s, v8.4s\n" "ldr s21, [%[wbptr]]\n" "fmla v23.4s, v20.4s, v11.4s\n" "add x27, x27, #4\n" "fmla v25.4s, v20.4s, v9.4s\n" "fmla v24.4s, v20.4s, v12.4s\n" "fmla v22.4s, v17.4s, v8.4s\n" "ldr s20, [%[wbptr], #4]\n" "fmla v23.4s, v26.4s, v8.4s\n" "ldr s14, [%[wbptr], #8]\n" "fmla v24.4s, v17.4s, v13.4s\n" "movi v29.16b, #0\n" "fmla v25.4s, v17.4s, v11.4s\n" "ldr s17, [%[wbptr], #16]\n" "fmax v7.4s, v7.4s, v29.4s\n" "fmax v6.4s, v6.4s, v29.4s\n" "fmla v24.4s, v26.4s, v9.4s\n" "ldr s13, [%[wbptr], #12]\n" "str s7, [%[outptr0]]\n" "fmla v25.4s, v27.4s, v8.4s\n" "str s6, [%[outptr0], %[output_col_stride1]]\n" "fmax v5.4s, v5.4s, v29.4s\n" "fmla v24.4s, v27.4s, v11.4s\n" "ldr s12, [%[wbptr], #20]\n" "str s5, [%[outptr0], x19]\n" "fmax v4.4s, v4.4s, v29.4s\n" "fmax v3.4s, v3.4s, v29.4s\n" "ldr s10, [%[wbptr], #28]\n" "str s4, [%[outptr0], x20]\n" "fmla v24.4s, v28.4s, v8.4s\n" "str s3, [x16]\n" "fmax v2.4s, v2.4s, v29.4s\n" "fmax v1.4s, v1.4s, v29.4s\n" "ldr s11, [%[wbptr], #24]\n" "str s2, [x16, %[output_col_stride1]]\n" "fmax v22.4s, v22.4s, v29.4s\n" "str s1, [x16, x19]\n" "fmax v15.4s, v15.4s, v29.4s\n" "str s22, [x16, x20]\n" "fmax v16.4s, v16.4s, v29.4s\n" "str s15, [x17]\n" "fmax v19.4s, v19.4s, v29.4s\n" "str s16, [x17, %[output_col_stride1]]\n" "fmax v25.4s, v25.4s, v29.4s\n" "str s19, [x17, x19]\n" "fmax v0.4s, v0.4s, v29.4s\n" "str s25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" "str s0, [x18]\n" "fmax v23.4s, v23.4s, v29.4s\n" "str s18, [x18, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" "str s23, [x18, x19]\n" "mov v7.16b, v21.16b\n" "str s24, [x18, x20]\n" "mov v3.16b, v21.16b\n" "mov v6.16b, v21.16b\n" "ldr s9, [%[wbptr], #32]\n" "mov v15.16b, v21.16b\n" "ldr s8, [%[wbptr], #36]\n" "mov v2.16b, v21.16b\n" "ldr s22, [%[inptr0]]\n" "mov v5.16b, v21.16b\n" "ldr s19, [x9]\n" "mov v0.16b, v21.16b\n" "ldr s23, [%[inptr0], %[input_col_stride1]]\n" "mov v16.16b, v21.16b\n" "ldr s18, [x24]\n" "mov v1.16b, v21.16b\n" "ldr s27, [x9, %[input_col_stride1]]\n" "mov v4.16b, v21.16b\n" "ldr s28, [%[inptr0], x28]\n" "fmla v7.4s, v22.4s, v20.4s\n" "ldr s25, [x26]\n" "fmla v3.4s, v19.4s, v20.4s\n" "ldr s22, [x24, %[input_col_stride1]]\n" "fmla v6.4s, v23.4s, v20.4s\n" "add %[outptr0], %[outptr0], #4\n" "fmla v7.4s, v19.4s, v17.4s\n" "add x16, x16, #4\n" "fmla v3.4s, v18.4s, v17.4s\n" "add x17, x17, #4\n" "fmla v15.4s, v18.4s, v20.4s\n" "add x18, x18, #4\n" "fmla v7.4s, v23.4s, v14.4s\n" "fmla v3.4s, v27.4s, v14.4s\n" "fmla v7.4s, v18.4s, v10.4s\n" "fmla v7.4s, v27.4s, v12.4s\n" "bne 5b\n" "6:\n" "mov v18.16b, v21.16b\n" "ldr s23, [x9, x28]\n" "mov v19.16b, v21.16b\n" "prfm pldl1keep, [x9, x25]\n" "fmla v6.4s, v27.4s, v17.4s\n" "prfm pldl1keep, [%[inptr0], x11]\n" "fmla v2.4s, v27.4s, v20.4s\n" "ldr s24, [%[inptr0], x23]\n" "fmla v7.4s, v28.4s, v13.4s\n" "prfm pldl1keep, [x10, #64]\n" "fmla v6.4s, v28.4s, v14.4s\n" "prfm pldl1keep, [x26, x8]\n" "fmla v5.4s, v28.4s, v20.4s\n" "ldr s26, [x10]\n" "fmla v3.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x25]\n" "fmla v15.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x9, x11]\n" "fmla v0.4s, v25.4s, v20.4s\n" "ldr s25, [x26, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [%[inptr0], x13]\n" "fmla v3.4s, v22.4s, v12.4s\n" "prfm pldl1keep, [x27, #64]\n" "fmla v6.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [x10, x8]\n" "fmla v15.4s, v22.4s, v14.4s\n" "prfm pldl1keep, [x26, x25]\n" "fmla v2.4s, v22.4s, v17.4s\n" "prfm pldl1keep, [x24, x11]\n" "fmla v16.4s, v22.4s, v20.4s\n" "ldr s22, [x24, x28]\n" "fmla v7.4s, v23.4s, v11.4s\n" "prfm pldl1keep, [x9, x13]\n" "fmla v3.4s, v23.4s, v13.4s\n" "prfm pldl1keep, [%[inptr0], x15]\n" "fmla v6.4s, v23.4s, v12.4s\n" "prfm pldl1keep, [x27, x8]\n" "fmla v2.4s, v23.4s, v14.4s\n" "prfm pldl1keep, [x10, x25]\n" "fmla v5.4s, v23.4s, v17.4s\n" "prfm pldl1keep, [x26, x11]\n" "fmla v1.4s, v23.4s, v20.4s\n" "ldr s23, [x9, x23]\n" "fmla v6.4s, v24.4s, v13.4s\n" "prfm pldl1keep, [x24, x13]\n" "fmla v5.4s, v24.4s, v14.4s\n" "prfm pldl1keep, [x9, x15]\n" "fmla v4.4s, v24.4s, v20.4s\n" "ldr s24, [%[inptr0], x12]\n" "fmla v15.4s, v26.4s, v10.4s\n" "prfm pldl1keep, [x27, x25]\n" "fmla v0.4s, v26.4s, v17.4s\n" "ldr s29, [x27]\n" "fmla v3.4s, v25.4s, v9.4s\n" "prfm pldl1keep, [x10, x11]\n" "fmla v15.4s, v25.4s, v12.4s\n" "prfm pldl1keep, [x26, x13]\n" "fmla v2.4s, v25.4s, v10.4s\n" "prfm pldl1keep, [x24, x15]\n" "fmla v0.4s, v25.4s, v14.4s\n" "prfm pldl1keep, [x27, x11]\n" "fmla v16.4s, v25.4s, v17.4s\n" "prfm pldl1keep, [x10, x13]\n" "fmla v18.4s, v25.4s, v20.4s\n" "ldr s26, [x10, %[input_col_stride1]]\n" "fmla v7.4s, v22.4s, v8.4s\n" "prfm pldl1keep, [x26, x15]\n" "fmla v3.4s, v22.4s, v11.4s\n" "prfm pldl1keep, [x27, x13]\n" "fmla v6.4s, v22.4s, v9.4s\n" "prfm pldl1keep, [x10, x15]\n" "fmla v15.4s, v22.4s, v13.4s\n" "prfm pldl1keep, [x27, x15]\n" "fmla v2.4s, v22.4s, v12.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v5.4s, v22.4s, v10.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v16.4s, v22.4s, v14.4s\n" "fmla v1.4s, v22.4s, v17.4s\n" "fmla v19.4s, v22.4s, v20.4s\n" "ldr s27, [x26, x28]\n" "fmla v6.4s, v23.4s, v11.4s\n" "fmla v2.4s, v23.4s, v13.4s\n" "fmla v5.4s, v23.4s, v12.4s\n" "fmla v1.4s, v23.4s, v14.4s\n" "fmla v4.4s, v23.4s, v17.4s\n" "fmla v0.4s, v29.4s, v10.4s\n" "mov v22.16b, v21.16b\n" "fmla v15.4s, v26.4s, v9.4s\n" "fmla v5.4s, v24.4s, v13.4s\n" "fmla v16.4s, v26.4s, v10.4s\n" "fmla v22.4s, v23.4s, v20.4s\n" "ldr s29, [x24, x23]\n" "fmla v4.4s, v24.4s, v14.4s\n" "ldr s28, [x9, x12]\n" "fmla v0.4s, v26.4s, v12.4s\n" "fmla v18.4s, v26.4s, v17.4s\n" "mov v23.16b, v21.16b\n" "fmla v3.4s, v27.4s, v8.4s\n" "fmla v15.4s, v27.4s, v11.4s\n" "fmla v2.4s, v27.4s, v9.4s\n" "fmla v0.4s, v27.4s, v13.4s\n" "fmla v16.4s, v27.4s, v12.4s\n" "fmla v1.4s, v27.4s, v10.4s\n" "fmla v18.4s, v27.4s, v14.4s\n" "fmla v19.4s, v27.4s, v17.4s\n" "fmla v23.4s, v27.4s, v20.4s\n" "mov v25.16b, v21.16b\n" "mov v24.16b, v21.16b\n" "fmla v6.4s, v29.4s, v8.4s\n" "fmla v2.4s, v29.4s, v11.4s\n" "fmla v5.4s, v29.4s, v9.4s\n" "fmla v16.4s, v29.4s, v13.4s\n" "fmla v1.4s, v29.4s, v12.4s\n" "fmla v4.4s, v29.4s, v10.4s\n" "fmla v19.4s, v29.4s, v14.4s\n" "fmla v22.4s, v29.4s, v17.4s\n" "fmla v25.4s, v29.4s, v20.4s\n" "ldr s21, [%[inptr0], x14]\n" "fmla v5.4s, v28.4s, v11.4s\n" "add %[inptr0], %[inptr0], #4\n" "fmla v1.4s, v28.4s, v13.4s\n" "fmla v4.4s, v28.4s, v12.4s\n" "fmla v22.4s, v28.4s, v14.4s\n" "ldr s26, [x27, %[input_col_stride1]]\n" "fmla v0.4s, v26.4s, v9.4s\n" "fmla v18.4s, v26.4s, v10.4s\n" "fmla v4.4s, v21.4s, v13.4s\n" "ldr s21, [x10, x28]\n" "fmla v15.4s, v21.4s, v8.4s\n" "ldr s29, [x26, x23]\n" "fmla v0.4s, v21.4s, v11.4s\n" "fmla v16.4s, v21.4s, v9.4s\n" "fmla v18.4s, v21.4s, v12.4s\n" "fmla v19.4s, v21.4s, v10.4s\n" "fmla v23.4s, v21.4s, v17.4s\n" "ldr s21, [x24, x12]\n" "fmla v2.4s, v29.4s, v8.4s\n" "fmla v16.4s, v29.4s, v11.4s\n" "fmla v1.4s, v29.4s, v9.4s\n" "fmla v18.4s, v29.4s, v13.4s\n" "fmla v19.4s, v29.4s, v12.4s\n" "fmla v22.4s, v29.4s, v10.4s\n" "fmla v23.4s, v29.4s, v14.4s\n" "fmla v25.4s, v29.4s, v17.4s\n" "fmla v24.4s, v29.4s, v20.4s\n" "ldr s28, [x9, x14]\n" "fmla v5.4s, v21.4s, v8.4s\n" "ldr s27, [x27, x28]\n" "fmla v1.4s, v21.4s, v11.4s\n" "add x9, x9, #4\n" "fmla v4.4s, v21.4s, v9.4s\n" "fmla v19.4s, v21.4s, v13.4s\n" "fmla v22.4s, v21.4s, v12.4s\n" "fmla v25.4s, v21.4s, v14.4s\n" "fmla v0.4s, v27.4s, v8.4s\n" "ldr s20, [x10, x23]\n" "fmla v4.4s, v28.4s, v11.4s\n" "fmla v18.4s, v27.4s, v9.4s\n" "fmla v22.4s, v28.4s, v13.4s\n" "ldr s26, [x26, x12]\n" "fmla v23.4s, v27.4s, v10.4s\n" "ldr s21, [x24, x14]\n" "fmla v16.4s, v20.4s, v8.4s\n" "add x24, x24, #4\n" "fmla v18.4s, v20.4s, v11.4s\n" "fmla v19.4s, v20.4s, v9.4s\n" "fmla v23.4s, v20.4s, v12.4s\n" "fmla v25.4s, v20.4s, v10.4s\n" "fmla v24.4s, v20.4s, v17.4s\n" "ldr s28, [x27, x23]\n" "fmla v1.4s, v26.4s, v8.4s\n" "ldr s20, [x10, x12]\n" "fmla v19.4s, v26.4s, v11.4s\n" "fmla v22.4s, v26.4s, v9.4s\n" "fmla v23.4s, v26.4s, v13.4s\n" "fmla v25.4s, v26.4s, v12.4s\n" "fmla v24.4s, v26.4s, v14.4s\n" "ldr s17, [x26, x14]\n" "fmla v4.4s, v21.4s, v8.4s\n" "ldr s26, [x27, x12]\n" "fmla v22.4s, v21.4s, v11.4s\n" "add x26, x26, #4\n" "fmla v25.4s, v21.4s, v13.4s\n" "ldr s27, [x10, x14]\n" "fmla v18.4s, v28.4s, v8.4s\n" "add x10, x10, #4\n" "fmla v23.4s, v28.4s, v9.4s\n" "fmla v24.4s, v28.4s, v10.4s\n" "fmla v19.4s, v20.4s, v8.4s\n" "ldr s28, [x27, x14]\n" "fmla v25.4s, v20.4s, v9.4s\n" "add x27, x27, #4\n" "fmla v23.4s, v20.4s, v11.4s\n" "fmla v24.4s, v20.4s, v12.4s\n" "fmla v22.4s, v17.4s, v8.4s\n" "movi v29.16b, #0\n" "fmla v25.4s, v17.4s, v11.4s\n" "fmla v24.4s, v17.4s, v13.4s\n" "fmla v23.4s, v26.4s, v8.4s\n" "fmax v7.4s, v7.4s, v29.4s\n" "fmla v25.4s, v27.4s, v8.4s\n" "fmax v6.4s, v6.4s, v29.4s\n" "str s7, [%[outptr0]]\n" "fmla v24.4s, v26.4s, v9.4s\n" "str s6, [%[outptr0], %[output_col_stride1]]\n" "fmax v5.4s, v5.4s, v29.4s\n" "fmax v4.4s, v4.4s, v29.4s\n" "fmax v3.4s, v3.4s, v29.4s\n" "str s5, [%[outptr0], x19]\n" "fmla v24.4s, v27.4s, v11.4s\n" "str s4, [%[outptr0], x20]\n" "fmax v2.4s, v2.4s, v29.4s\n" "str s3, [x16]\n" "fmax v1.4s, v1.4s, v29.4s\n" "str s2, [x16, %[output_col_stride1]]\n" "fmla v24.4s, v28.4s, v8.4s\n" "str s1, [x16, x19]\n" "fmax v22.4s, v22.4s, v29.4s\n" "fmax v15.4s, v15.4s, v29.4s\n" "add %[outptr0], %[outptr0], #4\n" "str s22, [x16, x20]\n" "fmax v16.4s, v16.4s, v29.4s\n" "str s15, [x17]\n" "fmax v19.4s, v19.4s, v29.4s\n" "str s16, [x17, %[output_col_stride1]]\n" "fmax v25.4s, v25.4s, v29.4s\n" "str s19, [x17, x19]\n" "fmax v0.4s, v0.4s, v29.4s\n" "str s25, [x17, x20]\n" "fmax v18.4s, v18.4s, v29.4s\n" "str s0, [x18]\n" "fmax v23.4s, v23.4s, v29.4s\n" "str s18, [x18, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v29.4s\n" "str s23, [x18, x19]\n" "add x16, x16, #4\n" "str s24, [x18, x20]\n" "add x17, x17, #4\n" "add x18, x18, #4\n" "7:\n" : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)) : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" ); } template <> template <> void Conv::execute_tile( int n_channels, const void *weight_bias_ptr, const float *input, const unsigned int input_row_stride, const unsigned int input_col_stride, float *output, const unsigned int output_row_stride, const unsigned int output_col_stride ) { __asm __volatile( "add x24, %[inptr0], %[input_row_stride]\n" "add x13, %[input_col_stride1], %[input_col_stride1]\n" "add x8, %[outptr0], %[output_row_stride]\n" "add x9, x24, %[input_row_stride]\n" "add x10, x13, #64\n" "add x19, x13, %[input_col_stride1]\n" "add x20, x9, %[input_row_stride]\n" "add x21, x19, #64\n" "add x17, x19, %[input_col_stride1]\n" "add x22, x20, %[input_row_stride]\n" "add x18, x17, #64\n" "add x11, x17, %[input_col_stride1]\n" "add x23, x22, %[input_row_stride]\n" "add x12, x11, #64\n" "add x25, x8, %[output_row_stride]\n" "add x26, x25, %[output_row_stride]\n" "add x27, %[output_col_stride1], %[output_col_stride1]\n" "and x14, %[n_channels], #3\n" "add x28, x27, %[output_col_stride1]\n" "lsr x15, %[n_channels], #2\n" "cbz x15, 4f\n" "1:\n" "ldr q23, [%[wbptr]]\n" "subs x15, x15, #1\n" "mov v12.16b, v23.16b\n" "ldr q20, [%[wbptr], #16]\n" "mov v8.16b, v23.16b\n" "ldr q6, [%[wbptr], #32]\n" "mov v11.16b, v23.16b\n" "ldr q5, [%[wbptr], #48]\n" "mov v16.16b, v23.16b\n" "ldr q19, [%[wbptr], #64]\n" "mov v7.16b, v23.16b\n" "ldr q4, [%[wbptr], #80]\n" "mov v10.16b, v23.16b\n" "ldr q3, [%[wbptr], #96]\n" "mov v14.16b, v23.16b\n" "ldr q2, [%[wbptr], #112]\n" "mov v15.16b, v23.16b\n" "ldr q1, [%[wbptr], #128]\n" "mov v17.16b, v23.16b\n" "ldr q0, [%[wbptr], #144]\n" "mov v9.16b, v23.16b\n" "ldr q28, [%[inptr0]]\n" "fmla v12.4s, v28.4s, v20.4s\n" "ldr q25, [x24]\n" "fmla v8.4s, v25.4s, v20.4s\n" "ldr q18, [%[inptr0], %[input_col_stride1]]\n" "fmla v11.4s, v18.4s, v20.4s\n" "ldr q30, [x9]\n" "fmla v12.4s, v25.4s, v19.4s\n" "ldr q29, [x24, %[input_col_stride1]]\n" "fmla v8.4s, v30.4s, v19.4s\n" "ldr q24, [%[inptr0], x13]\n" "fmla v16.4s, v30.4s, v20.4s\n" "ldr q27, [x20]\n" "fmla v12.4s, v18.4s, v6.4s\n" "ldr q22, [x9, %[input_col_stride1]]\n" "fmla v8.4s, v29.4s, v6.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "prfm pldl1keep, [x24, #64]\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v12.4s, v30.4s, v2.4s\n" "prfm pldl1keep, [x9, #64]\n" "prfm pldl1keep, [x24, x16]\n" "prfm pldl1keep, [%[inptr0], x10]\n" "prfm pldl1keep, [x20, #64]\n" "prfm pldl1keep, [x9, x16]\n" "fmla v12.4s, v29.4s, v4.4s\n" "beq 3f\n" "2:\n" "mov v13.16b, v23.16b\n" "ldr q21, [x24, x13]\n" "mov v18.16b, v23.16b\n" "prfm pldl1keep, [x24, x10]\n" "fmla v11.4s, v29.4s, v19.4s\n" "prfm pldl1keep, [%[inptr0], x21]\n" "fmla v7.4s, v29.4s, v20.4s\n" "ldr q25, [%[inptr0], x19]\n" "fmla v12.4s, v24.4s, v5.4s\n" "prfm pldl1keep, [x22, #64]\n" "fmla v11.4s, v24.4s, v6.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v10.4s, v24.4s, v20.4s\n" "ldr q24, [x22]\n" "fmla v8.4s, v27.4s, v2.4s\n" "prfm pldl1keep, [x9, x10]\n" "fmla v16.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x24, x21]\n" "fmla v14.4s, v27.4s, v20.4s\n" "ldr q26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v16.4s, v22.4s, v6.4s\n" "prfm pldl1keep, [x20, x10]\n" "fmla v7.4s, v22.4s, v19.4s\n" "prfm pldl1keep, [x9, x21]\n" "fmla v15.4s, v22.4s, v20.4s\n" "ldr q30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" "prfm pldl1keep, [x24, x18]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" "prfm pldl1keep, [x23, x16]\n" "fmla v7.4s, v21.4s, v6.4s\n" "prfm pldl1keep, [x22, x10]\n" "fmla v10.4s, v21.4s, v19.4s\n" "prfm pldl1keep, [x20, x21]\n" "fmla v17.4s, v21.4s, v20.4s\n" "ldr q22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" "ldr q21, [%[inptr0], x17]\n" "fmla v16.4s, v24.4s, v2.4s\n" "prfm pldl1keep, [x23, x10]\n" "fmla v14.4s, v24.4s, v19.4s\n" "ldr q24, [x23]\n" "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" "prfm pldl1keep, [x20, x18]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" "prfm pldl1keep, [x22, x18]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr q26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x23, x18]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" "prfm pldl1keep, [x23, x12]\n" "fmla v7.4s, v30.4s, v4.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v10.4s, v30.4s, v2.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v15.4s, v30.4s, v6.4s\n" "subs x15, x15, #1\n" "fmla v17.4s, v30.4s, v19.4s\n" "fmla v18.4s, v30.4s, v20.4s\n" "mov v25.16b, v23.16b\n" "fmla v11.4s, v22.4s, v3.4s\n" "fmla v7.4s, v22.4s, v5.4s\n" "fmla v10.4s, v22.4s, v4.4s\n" "fmla v17.4s, v22.4s, v6.4s\n" "fmla v9.4s, v22.4s, v19.4s\n" "fmla v25.4s, v22.4s, v20.4s\n" "ldr q27, [x20, x13]\n" "fmla v10.4s, v21.4s, v5.4s\n" "fmla v14.4s, v24.4s, v2.4s\n" "mov v22.16b, v23.16b\n" "fmla v9.4s, v21.4s, v6.4s\n" "mov v24.16b, v23.16b\n" "mov v21.16b, v23.16b\n" "fmla v16.4s, v26.4s, v1.4s\n" "fmla v14.4s, v26.4s, v4.4s\n" "fmla v15.4s, v26.4s, v2.4s\n" "fmla v13.4s, v26.4s, v19.4s\n" "fmla v8.4s, v27.4s, v0.4s\n" "ldr q28, [x9, x19]\n" "fmla v16.4s, v27.4s, v3.4s\n" "fmla v7.4s, v27.4s, v1.4s\n" "fmla v14.4s, v27.4s, v5.4s\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v17.4s, v27.4s, v2.4s\n" "fmla v13.4s, v27.4s, v6.4s\n" "fmla v18.4s, v27.4s, v19.4s\n" "fmla v22.4s, v27.4s, v20.4s\n" "fmla v11.4s, v28.4s, v0.4s\n" "ldr q29, [x24, x17]\n" "fmla v7.4s, v28.4s, v3.4s\n" "fmla v10.4s, v28.4s, v1.4s\n" "fmla v15.4s, v28.4s, v5.4s\n" "fmla v17.4s, v28.4s, v4.4s\n" "fmla v9.4s, v28.4s, v2.4s\n" "fmla v18.4s, v28.4s, v6.4s\n" "fmla v25.4s, v28.4s, v19.4s\n" "fmla v24.4s, v28.4s, v20.4s\n" "fmla v10.4s, v29.4s, v3.4s\n" "ldr q23, [%[inptr0], x11]\n" "fmla v17.4s, v29.4s, v5.4s\n" "add %[inptr0], %[inptr0], #16\n" "fmla v9.4s, v29.4s, v4.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "fmla v25.4s, v29.4s, v6.4s\n" "ldr q30, [x23, %[input_col_stride1]]\n" "fmla v14.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v9.4s, v23.4s, v5.4s\n" "ldr q23, [x22, x13]\n" "fmla v13.4s, v30.4s, v2.4s\n" "ldr q29, [x20, x19]\n" "fmla v16.4s, v23.4s, v0.4s\n" "prfm pldl1keep, [%[inptr0], x10]\n" "fmla v14.4s, v23.4s, v3.4s\n" "fmla v15.4s, v23.4s, v1.4s\n" "fmla v13.4s, v23.4s, v4.4s\n" "fmla v18.4s, v23.4s, v2.4s\n" "fmla v22.4s, v23.4s, v19.4s\n" "ldr q23, [x9, x17]\n" "fmla v7.4s, v29.4s, v0.4s\n" "fmla v15.4s, v29.4s, v3.4s\n" "fmla v17.4s, v29.4s, v1.4s\n" "fmla v13.4s, v29.4s, v5.4s\n" "fmla v18.4s, v29.4s, v4.4s\n" "fmla v25.4s, v29.4s, v2.4s\n" "fmla v22.4s, v29.4s, v6.4s\n" "fmla v24.4s, v29.4s, v19.4s\n" "fmla v21.4s, v29.4s, v20.4s\n" "ldr q26, [x24, x11]\n" "fmla v10.4s, v23.4s, v0.4s\n" "ldr q28, [x23, x13]\n" "fmla v17.4s, v23.4s, v3.4s\n" "add x24, x24, #16\n" "fmla v9.4s, v23.4s, v1.4s\n" "prfm pldl1keep, [x24, #64]\n" "fmla v18.4s, v23.4s, v5.4s\n" "prfm pldl1keep, [x24, x16]\n" "fmla v25.4s, v23.4s, v4.4s\n" "fmla v24.4s, v23.4s, v6.4s\n" "fmla v9.4s, v26.4s, v3.4s\n" "ldr q20, [x22, x19]\n" "fmla v14.4s, v28.4s, v0.4s\n" "fmla v13.4s, v28.4s, v1.4s\n" "fmla v25.4s, v26.4s, v5.4s\n" "ldr q26, [x20, x17]\n" "fmla v22.4s, v28.4s, v2.4s\n" "ldr q23, [x9, x11]\n" "fmla v15.4s, v20.4s, v0.4s\n" "add x9, x9, #16\n" "fmla v13.4s, v20.4s, v3.4s\n" "prfm pldl1keep, [x9, #64]\n" "fmla v18.4s, v20.4s, v1.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v22.4s, v20.4s, v4.4s\n" "fmla v24.4s, v20.4s, v2.4s\n" "fmla v21.4s, v20.4s, v19.4s\n" "ldr q27, [x23, x19]\n" "fmla v17.4s, v26.4s, v0.4s\n" "ldr q20, [x22, x17]\n" "fmla v18.4s, v26.4s, v3.4s\n" "fmla v25.4s, v26.4s, v1.4s\n" "fmla v22.4s, v26.4s, v5.4s\n" "fmla v24.4s, v26.4s, v4.4s\n" "fmla v21.4s, v26.4s, v6.4s\n" "ldr q19, [x20, x11]\n" "fmla v9.4s, v23.4s, v0.4s\n" "ldr q28, [x23, x17]\n" "fmla v25.4s, v23.4s, v3.4s\n" "add x20, x20, #16\n" "fmla v24.4s, v23.4s, v5.4s\n" "ldr q29, [x22, x11]\n" "fmla v13.4s, v27.4s, v0.4s\n" "prfm pldl1keep, [x20, #64]\n" "fmla v22.4s, v27.4s, v1.4s\n" "add x22, x22, #16\n" "fmla v21.4s, v27.4s, v2.4s\n" "ldr q30, [x23, x11]\n" "fmla v18.4s, v20.4s, v0.4s\n" "ldr q23, [%[wbptr]]\n" "fmla v22.4s, v20.4s, v3.4s\n" "add x23, x23, #16\n" "fmla v24.4s, v20.4s, v1.4s\n" "fmla v21.4s, v20.4s, v4.4s\n" "fmla v25.4s, v19.4s, v0.4s\n" "ldr q20, [%[wbptr], #16]\n" "fmla v22.4s, v28.4s, v0.4s\n" "ldr q6, [%[wbptr], #32]\n" "fmla v21.4s, v19.4s, v5.4s\n" "movi v26.16b, #0\n" "fmla v24.4s, v19.4s, v3.4s\n" "ldr q19, [%[wbptr], #64]\n" "fmax v12.4s, v12.4s, v26.4s\n" "fmax v11.4s, v11.4s, v26.4s\n" "fmla v21.4s, v28.4s, v1.4s\n" "ldr q5, [%[wbptr], #48]\n" "fmla v24.4s, v29.4s, v0.4s\n" "ldr q4, [%[wbptr], #80]\n" "fmax v10.4s, v10.4s, v26.4s\n" "fmax v9.4s, v9.4s, v26.4s\n" "fmla v21.4s, v29.4s, v3.4s\n" "ldr q2, [%[wbptr], #112]\n" "fmov v27.4s, #6.0\n" "fmax v8.4s, v8.4s, v26.4s\n" "fmax v7.4s, v7.4s, v26.4s\n" "fmax v17.4s, v17.4s, v26.4s\n" "fmla v21.4s, v30.4s, v0.4s\n" "ldr q3, [%[wbptr], #96]\n" "fmin v12.4s, v12.4s, v27.4s\n" "ldr q1, [%[wbptr], #128]\n" "fmin v11.4s, v11.4s, v27.4s\n" "fmin v10.4s, v10.4s, v27.4s\n" "str q12, [%[outptr0]]\n" "fmin v9.4s, v9.4s, v27.4s\n" "str q11, [%[outptr0], %[output_col_stride1]]\n" "fmin v8.4s, v8.4s, v27.4s\n" "str q10, [%[outptr0], x27]\n" "fmin v7.4s, v7.4s, v27.4s\n" "str q9, [%[outptr0], x28]\n" "fmin v17.4s, v17.4s, v27.4s\n" "str q8, [x8]\n" "fmax v25.4s, v25.4s, v26.4s\n" "str q7, [x8, %[output_col_stride1]]\n" "fmax v16.4s, v16.4s, v26.4s\n" "str q17, [x8, x27]\n" "fmin v25.4s, v25.4s, v27.4s\n" "fmin v16.4s, v16.4s, v27.4s\n" "ldr q0, [%[wbptr], #144]\n" "str q25, [x8, x28]\n" "fmax v15.4s, v15.4s, v26.4s\n" "str q16, [x25]\n" "fmax v18.4s, v18.4s, v26.4s\n" "fmin v15.4s, v15.4s, v27.4s\n" "ldr q28, [%[inptr0]]\n" "fmin v18.4s, v18.4s, v27.4s\n" "ldr q25, [x24]\n" "str q15, [x25, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v26.4s\n" "str q18, [x25, x27]\n" "fmax v14.4s, v14.4s, v26.4s\n" "fmin v24.4s, v24.4s, v27.4s\n" "ldr q18, [%[inptr0], %[input_col_stride1]]\n" "fmin v14.4s, v14.4s, v27.4s\n" "ldr q30, [x9]\n" "str q24, [x25, x28]\n" "fmax v13.4s, v13.4s, v26.4s\n" "str q14, [x26]\n" "fmax v22.4s, v22.4s, v26.4s\n" "fmin v13.4s, v13.4s, v27.4s\n" "ldr q29, [x24, %[input_col_stride1]]\n" "fmin v22.4s, v22.4s, v27.4s\n" "ldr q24, [%[inptr0], x13]\n" "str q13, [x26, %[output_col_stride1]]\n" "fmax v21.4s, v21.4s, v26.4s\n" "str q22, [x26, x27]\n" "mov v12.16b, v23.16b\n" "fmin v21.4s, v21.4s, v27.4s\n" "ldr q27, [x20]\n" "mov v8.16b, v23.16b\n" "ldr q22, [x9, %[input_col_stride1]]\n" "str q21, [x26, x28]\n" "mov v11.16b, v23.16b\n" "mov v16.16b, v23.16b\n" "add %[outptr0], %[outptr0], #16\n" "mov v7.16b, v23.16b\n" "add x8, x8, #16\n" "mov v10.16b, v23.16b\n" "add x25, x25, #16\n" "mov v14.16b, v23.16b\n" "add x26, x26, #16\n" "mov v15.16b, v23.16b\n" "mov v17.16b, v23.16b\n" "mov v9.16b, v23.16b\n" "fmla v12.4s, v28.4s, v20.4s\n" "fmla v8.4s, v25.4s, v20.4s\n" "fmla v11.4s, v18.4s, v20.4s\n" "fmla v16.4s, v30.4s, v20.4s\n" "fmla v12.4s, v25.4s, v19.4s\n" "fmla v8.4s, v30.4s, v19.4s\n" "fmla v12.4s, v18.4s, v6.4s\n" "fmla v8.4s, v29.4s, v6.4s\n" "fmla v12.4s, v30.4s, v2.4s\n" "fmla v12.4s, v29.4s, v4.4s\n" "bne 2b\n" "3:\n" "mov v13.16b, v23.16b\n" "ldr q21, [x24, x13]\n" "mov v18.16b, v23.16b\n" "prfm pldl1keep, [x24, x10]\n" "fmla v11.4s, v29.4s, v19.4s\n" "prfm pldl1keep, [%[inptr0], x21]\n" "fmla v7.4s, v29.4s, v20.4s\n" "ldr q25, [%[inptr0], x19]\n" "fmla v12.4s, v24.4s, v5.4s\n" "prfm pldl1keep, [x22, #64]\n" "fmla v11.4s, v24.4s, v6.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v10.4s, v24.4s, v20.4s\n" "ldr q24, [x22]\n" "fmla v8.4s, v27.4s, v2.4s\n" "prfm pldl1keep, [x9, x10]\n" "fmla v16.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x24, x21]\n" "fmla v14.4s, v27.4s, v20.4s\n" "ldr q26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v16.4s, v22.4s, v6.4s\n" "prfm pldl1keep, [x20, x10]\n" "fmla v7.4s, v22.4s, v19.4s\n" "prfm pldl1keep, [x9, x21]\n" "fmla v15.4s, v22.4s, v20.4s\n" "ldr q30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" "prfm pldl1keep, [x24, x18]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" "prfm pldl1keep, [x23, x16]\n" "fmla v7.4s, v21.4s, v6.4s\n" "prfm pldl1keep, [x22, x10]\n" "fmla v10.4s, v21.4s, v19.4s\n" "prfm pldl1keep, [x20, x21]\n" "fmla v17.4s, v21.4s, v20.4s\n" "ldr q22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" "ldr q21, [%[inptr0], x17]\n" "fmla v16.4s, v24.4s, v2.4s\n" "prfm pldl1keep, [x23, x10]\n" "fmla v14.4s, v24.4s, v19.4s\n" "ldr q24, [x23]\n" "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" "prfm pldl1keep, [x20, x18]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" "prfm pldl1keep, [x22, x18]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr q26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x23, x18]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" "prfm pldl1keep, [x23, x12]\n" "fmla v7.4s, v30.4s, v4.4s\n" "add %[wbptr], %[wbptr], #160\n" "fmla v10.4s, v30.4s, v2.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v15.4s, v30.4s, v6.4s\n" "fmla v17.4s, v30.4s, v19.4s\n" "fmla v18.4s, v30.4s, v20.4s\n" "ldr q27, [x20, x13]\n" "fmla v11.4s, v22.4s, v3.4s\n" "fmla v7.4s, v22.4s, v5.4s\n" "fmla v10.4s, v22.4s, v4.4s\n" "fmla v17.4s, v22.4s, v6.4s\n" "fmla v9.4s, v22.4s, v19.4s\n" "fmla v14.4s, v24.4s, v2.4s\n" "mov v25.16b, v23.16b\n" "fmla v16.4s, v26.4s, v1.4s\n" "fmla v10.4s, v21.4s, v5.4s\n" "fmla v15.4s, v26.4s, v2.4s\n" "fmla v25.4s, v22.4s, v20.4s\n" "ldr q28, [x9, x19]\n" "fmla v9.4s, v21.4s, v6.4s\n" "ldr q29, [x24, x17]\n" "fmla v14.4s, v26.4s, v4.4s\n" "fmla v13.4s, v26.4s, v19.4s\n" "mov v22.16b, v23.16b\n" "fmla v8.4s, v27.4s, v0.4s\n" "fmla v16.4s, v27.4s, v3.4s\n" "fmla v7.4s, v27.4s, v1.4s\n" "fmla v14.4s, v27.4s, v5.4s\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v17.4s, v27.4s, v2.4s\n" "fmla v13.4s, v27.4s, v6.4s\n" "fmla v18.4s, v27.4s, v19.4s\n" "fmla v22.4s, v27.4s, v20.4s\n" "mov v24.16b, v23.16b\n" "mov v21.16b, v23.16b\n" "fmla v11.4s, v28.4s, v0.4s\n" "fmla v7.4s, v28.4s, v3.4s\n" "fmla v10.4s, v28.4s, v1.4s\n" "fmla v15.4s, v28.4s, v5.4s\n" "fmla v17.4s, v28.4s, v4.4s\n" "fmla v9.4s, v28.4s, v2.4s\n" "fmla v18.4s, v28.4s, v6.4s\n" "fmla v25.4s, v28.4s, v19.4s\n" "fmla v24.4s, v28.4s, v20.4s\n" "ldr q23, [%[inptr0], x11]\n" "fmla v10.4s, v29.4s, v3.4s\n" "add %[inptr0], %[inptr0], #16\n" "fmla v17.4s, v29.4s, v5.4s\n" "fmla v9.4s, v29.4s, v4.4s\n" "fmla v25.4s, v29.4s, v6.4s\n" "ldr q30, [x23, %[input_col_stride1]]\n" "fmla v14.4s, v30.4s, v1.4s\n" "fmla v13.4s, v30.4s, v2.4s\n" "fmla v9.4s, v23.4s, v5.4s\n" "ldr q23, [x22, x13]\n" "fmla v16.4s, v23.4s, v0.4s\n" "ldr q29, [x20, x19]\n" "fmla v14.4s, v23.4s, v3.4s\n" "fmla v15.4s, v23.4s, v1.4s\n" "fmla v13.4s, v23.4s, v4.4s\n" "fmla v18.4s, v23.4s, v2.4s\n" "fmla v22.4s, v23.4s, v19.4s\n" "ldr q23, [x9, x17]\n" "fmla v7.4s, v29.4s, v0.4s\n" "fmla v15.4s, v29.4s, v3.4s\n" "fmla v17.4s, v29.4s, v1.4s\n" "fmla v13.4s, v29.4s, v5.4s\n" "fmla v18.4s, v29.4s, v4.4s\n" "fmla v25.4s, v29.4s, v2.4s\n" "fmla v22.4s, v29.4s, v6.4s\n" "fmla v24.4s, v29.4s, v19.4s\n" "fmla v21.4s, v29.4s, v20.4s\n" "ldr q26, [x24, x11]\n" "fmla v10.4s, v23.4s, v0.4s\n" "ldr q28, [x23, x13]\n" "fmla v17.4s, v23.4s, v3.4s\n" "add x24, x24, #16\n" "fmla v9.4s, v23.4s, v1.4s\n" "fmla v18.4s, v23.4s, v5.4s\n" "fmla v25.4s, v23.4s, v4.4s\n" "fmla v24.4s, v23.4s, v6.4s\n" "fmla v14.4s, v28.4s, v0.4s\n" "ldr q20, [x22, x19]\n" "fmla v9.4s, v26.4s, v3.4s\n" "fmla v13.4s, v28.4s, v1.4s\n" "fmla v25.4s, v26.4s, v5.4s\n" "ldr q26, [x20, x17]\n" "fmla v22.4s, v28.4s, v2.4s\n" "ldr q23, [x9, x11]\n" "fmla v15.4s, v20.4s, v0.4s\n" "add x9, x9, #16\n" "fmla v13.4s, v20.4s, v3.4s\n" "fmla v18.4s, v20.4s, v1.4s\n" "fmla v22.4s, v20.4s, v4.4s\n" "fmla v24.4s, v20.4s, v2.4s\n" "fmla v21.4s, v20.4s, v19.4s\n" "ldr q27, [x23, x19]\n" "fmla v17.4s, v26.4s, v0.4s\n" "ldr q20, [x22, x17]\n" "fmla v18.4s, v26.4s, v3.4s\n" "fmla v25.4s, v26.4s, v1.4s\n" "fmla v22.4s, v26.4s, v5.4s\n" "fmla v24.4s, v26.4s, v4.4s\n" "fmla v21.4s, v26.4s, v6.4s\n" "ldr q19, [x20, x11]\n" "fmla v9.4s, v23.4s, v0.4s\n" "ldr q28, [x23, x17]\n" "fmla v25.4s, v23.4s, v3.4s\n" "add x20, x20, #16\n" "fmla v24.4s, v23.4s, v5.4s\n" "ldr q29, [x22, x11]\n" "fmla v13.4s, v27.4s, v0.4s\n" "add x22, x22, #16\n" "fmla v22.4s, v27.4s, v1.4s\n" "fmla v21.4s, v27.4s, v2.4s\n" "fmla v18.4s, v20.4s, v0.4s\n" "ldr q30, [x23, x11]\n" "fmla v24.4s, v20.4s, v1.4s\n" "add x23, x23, #16\n" "fmla v22.4s, v20.4s, v3.4s\n" "fmla v21.4s, v20.4s, v4.4s\n" "fmla v25.4s, v19.4s, v0.4s\n" "movi v26.16b, #0\n" "fmla v24.4s, v19.4s, v3.4s\n" "fmov v27.4s, #6.0\n" "fmla v21.4s, v19.4s, v5.4s\n" "fmla v22.4s, v28.4s, v0.4s\n" "fmax v12.4s, v12.4s, v26.4s\n" "fmax v11.4s, v11.4s, v26.4s\n" "fmla v24.4s, v29.4s, v0.4s\n" "fmax v10.4s, v10.4s, v26.4s\n" "fmla v21.4s, v28.4s, v1.4s\n" "fmin v12.4s, v12.4s, v27.4s\n" "fmin v11.4s, v11.4s, v27.4s\n" "fmin v10.4s, v10.4s, v27.4s\n" "str q12, [%[outptr0]]\n" "fmax v9.4s, v9.4s, v26.4s\n" "str q11, [%[outptr0], %[output_col_stride1]]\n" "fmla v21.4s, v29.4s, v3.4s\n" "str q10, [%[outptr0], x27]\n" "fmin v9.4s, v9.4s, v27.4s\n" "fmax v8.4s, v8.4s, v26.4s\n" "fmax v7.4s, v7.4s, v26.4s\n" "str q9, [%[outptr0], x28]\n" "fmla v21.4s, v30.4s, v0.4s\n" "fmin v8.4s, v8.4s, v27.4s\n" "add %[outptr0], %[outptr0], #16\n" "fmin v7.4s, v7.4s, v27.4s\n" "fmax v17.4s, v17.4s, v26.4s\n" "str q8, [x8]\n" "fmax v25.4s, v25.4s, v26.4s\n" "str q7, [x8, %[output_col_stride1]]\n" "fmin v17.4s, v17.4s, v27.4s\n" "fmin v25.4s, v25.4s, v27.4s\n" "fmax v16.4s, v16.4s, v26.4s\n" "str q17, [x8, x27]\n" "fmax v15.4s, v15.4s, v26.4s\n" "str q25, [x8, x28]\n" "fmin v16.4s, v16.4s, v27.4s\n" "fmin v15.4s, v15.4s, v27.4s\n" "add x8, x8, #16\n" "str q16, [x25]\n" "fmax v18.4s, v18.4s, v26.4s\n" "str q15, [x25, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v26.4s\n" "fmin v18.4s, v18.4s, v27.4s\n" "fmax v14.4s, v14.4s, v26.4s\n" "fmin v24.4s, v24.4s, v27.4s\n" "fmax v13.4s, v13.4s, v26.4s\n" "str q18, [x25, x27]\n" "fmin v14.4s, v14.4s, v27.4s\n" "str q24, [x25, x28]\n" "fmin v13.4s, v13.4s, v27.4s\n" "str q14, [x26]\n" "fmax v22.4s, v22.4s, v26.4s\n" "str q13, [x26, %[output_col_stride1]]\n" "fmax v21.4s, v21.4s, v26.4s\n" "fmin v22.4s, v22.4s, v27.4s\n" "add x25, x25, #16\n" "fmin v21.4s, v21.4s, v27.4s\n" "str q22, [x26, x27]\n" "str q21, [x26, x28]\n" "add x26, x26, #16\n" "4:\n" "cbz x14, 7f\n" "ldr s23, [%[wbptr]]\n" "mov v12.16b, v23.16b\n" "ldr s20, [%[wbptr], #4]\n" "mov v8.16b, v23.16b\n" "ldr s6, [%[wbptr], #8]\n" "mov v11.16b, v23.16b\n" "ldr s5, [%[wbptr], #12]\n" "mov v16.16b, v23.16b\n" "ldr s19, [%[wbptr], #16]\n" "mov v7.16b, v23.16b\n" "ldr s4, [%[wbptr], #20]\n" "mov v10.16b, v23.16b\n" "ldr s3, [%[wbptr], #24]\n" "mov v14.16b, v23.16b\n" "ldr s2, [%[wbptr], #28]\n" "mov v15.16b, v23.16b\n" "ldr s1, [%[wbptr], #32]\n" "mov v17.16b, v23.16b\n" "ldr s0, [%[wbptr], #36]\n" "mov v9.16b, v23.16b\n" "ldr s28, [%[inptr0]]\n" "fmla v12.4s, v28.4s, v20.4s\n" "ldr s25, [x24]\n" "fmla v8.4s, v25.4s, v20.4s\n" "ldr s18, [%[inptr0], %[input_col_stride1]]\n" "fmla v11.4s, v18.4s, v20.4s\n" "ldr s30, [x9]\n" "fmla v12.4s, v25.4s, v19.4s\n" "ldr s29, [x24, %[input_col_stride1]]\n" "fmla v8.4s, v30.4s, v19.4s\n" "ldr s24, [%[inptr0], x13]\n" "fmla v16.4s, v30.4s, v20.4s\n" "ldr s27, [x20]\n" "fmla v12.4s, v18.4s, v6.4s\n" "ldr s22, [x9, %[input_col_stride1]]\n" "fmla v8.4s, v29.4s, v6.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "prfm pldl1keep, [x24, #64]\n" "subs x14, x14, #1\n" "prfm pldl1keep, [%[inptr0], x16]\n" "prfm pldl1keep, [x9, #64]\n" "fmla v12.4s, v30.4s, v2.4s\n" "prfm pldl1keep, [x24, x16]\n" "prfm pldl1keep, [%[inptr0], x10]\n" "prfm pldl1keep, [x20, #64]\n" "prfm pldl1keep, [x9, x16]\n" "fmla v12.4s, v29.4s, v4.4s\n" "beq 6f\n" "5:\n" "mov v13.16b, v23.16b\n" "ldr s21, [x24, x13]\n" "mov v18.16b, v23.16b\n" "prfm pldl1keep, [x24, x10]\n" "fmla v11.4s, v29.4s, v19.4s\n" "prfm pldl1keep, [%[inptr0], x21]\n" "fmla v7.4s, v29.4s, v20.4s\n" "ldr s25, [%[inptr0], x19]\n" "fmla v12.4s, v24.4s, v5.4s\n" "prfm pldl1keep, [x22, #64]\n" "fmla v11.4s, v24.4s, v6.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v10.4s, v24.4s, v20.4s\n" "ldr s24, [x22]\n" "fmla v8.4s, v27.4s, v2.4s\n" "prfm pldl1keep, [x9, x10]\n" "fmla v16.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x24, x21]\n" "fmla v14.4s, v27.4s, v20.4s\n" "ldr s26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v16.4s, v22.4s, v6.4s\n" "prfm pldl1keep, [x20, x10]\n" "fmla v7.4s, v22.4s, v19.4s\n" "prfm pldl1keep, [x9, x21]\n" "fmla v15.4s, v22.4s, v20.4s\n" "ldr s30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" "prfm pldl1keep, [x24, x18]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" "prfm pldl1keep, [x23, x16]\n" "fmla v7.4s, v21.4s, v6.4s\n" "prfm pldl1keep, [x22, x10]\n" "fmla v10.4s, v21.4s, v19.4s\n" "prfm pldl1keep, [x20, x21]\n" "fmla v17.4s, v21.4s, v20.4s\n" "ldr s22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" "ldr s21, [%[inptr0], x17]\n" "fmla v16.4s, v24.4s, v2.4s\n" "prfm pldl1keep, [x23, x10]\n" "fmla v14.4s, v24.4s, v19.4s\n" "ldr s24, [x23]\n" "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" "prfm pldl1keep, [x20, x18]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" "prfm pldl1keep, [x22, x18]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr s26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x23, x18]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" "prfm pldl1keep, [x23, x12]\n" "fmla v7.4s, v30.4s, v4.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v10.4s, v30.4s, v2.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v15.4s, v30.4s, v6.4s\n" "subs x14, x14, #1\n" "fmla v17.4s, v30.4s, v19.4s\n" "fmla v18.4s, v30.4s, v20.4s\n" "mov v25.16b, v23.16b\n" "fmla v11.4s, v22.4s, v3.4s\n" "fmla v7.4s, v22.4s, v5.4s\n" "fmla v10.4s, v22.4s, v4.4s\n" "fmla v17.4s, v22.4s, v6.4s\n" "fmla v9.4s, v22.4s, v19.4s\n" "fmla v25.4s, v22.4s, v20.4s\n" "ldr s27, [x20, x13]\n" "fmla v10.4s, v21.4s, v5.4s\n" "fmla v14.4s, v24.4s, v2.4s\n" "mov v22.16b, v23.16b\n" "fmla v9.4s, v21.4s, v6.4s\n" "mov v24.16b, v23.16b\n" "mov v21.16b, v23.16b\n" "fmla v16.4s, v26.4s, v1.4s\n" "fmla v14.4s, v26.4s, v4.4s\n" "fmla v15.4s, v26.4s, v2.4s\n" "fmla v13.4s, v26.4s, v19.4s\n" "fmla v8.4s, v27.4s, v0.4s\n" "ldr s28, [x9, x19]\n" "fmla v16.4s, v27.4s, v3.4s\n" "fmla v7.4s, v27.4s, v1.4s\n" "fmla v14.4s, v27.4s, v5.4s\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v17.4s, v27.4s, v2.4s\n" "fmla v13.4s, v27.4s, v6.4s\n" "fmla v18.4s, v27.4s, v19.4s\n" "fmla v22.4s, v27.4s, v20.4s\n" "fmla v11.4s, v28.4s, v0.4s\n" "ldr s29, [x24, x17]\n" "fmla v7.4s, v28.4s, v3.4s\n" "fmla v10.4s, v28.4s, v1.4s\n" "fmla v15.4s, v28.4s, v5.4s\n" "fmla v17.4s, v28.4s, v4.4s\n" "fmla v9.4s, v28.4s, v2.4s\n" "fmla v18.4s, v28.4s, v6.4s\n" "fmla v25.4s, v28.4s, v19.4s\n" "fmla v24.4s, v28.4s, v20.4s\n" "fmla v10.4s, v29.4s, v3.4s\n" "ldr s23, [%[inptr0], x11]\n" "fmla v17.4s, v29.4s, v5.4s\n" "add %[inptr0], %[inptr0], #4\n" "fmla v9.4s, v29.4s, v4.4s\n" "prfm pldl1keep, [%[inptr0], #64]\n" "fmla v25.4s, v29.4s, v6.4s\n" "ldr s30, [x23, %[input_col_stride1]]\n" "fmla v14.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [%[inptr0], x16]\n" "fmla v9.4s, v23.4s, v5.4s\n" "ldr s23, [x22, x13]\n" "fmla v13.4s, v30.4s, v2.4s\n" "ldr s29, [x20, x19]\n" "fmla v16.4s, v23.4s, v0.4s\n" "prfm pldl1keep, [%[inptr0], x10]\n" "fmla v14.4s, v23.4s, v3.4s\n" "fmla v15.4s, v23.4s, v1.4s\n" "fmla v13.4s, v23.4s, v4.4s\n" "fmla v18.4s, v23.4s, v2.4s\n" "fmla v22.4s, v23.4s, v19.4s\n" "ldr s23, [x9, x17]\n" "fmla v7.4s, v29.4s, v0.4s\n" "fmla v15.4s, v29.4s, v3.4s\n" "fmla v17.4s, v29.4s, v1.4s\n" "fmla v13.4s, v29.4s, v5.4s\n" "fmla v18.4s, v29.4s, v4.4s\n" "fmla v25.4s, v29.4s, v2.4s\n" "fmla v22.4s, v29.4s, v6.4s\n" "fmla v24.4s, v29.4s, v19.4s\n" "fmla v21.4s, v29.4s, v20.4s\n" "ldr s26, [x24, x11]\n" "fmla v10.4s, v23.4s, v0.4s\n" "ldr s28, [x23, x13]\n" "fmla v17.4s, v23.4s, v3.4s\n" "add x24, x24, #4\n" "fmla v9.4s, v23.4s, v1.4s\n" "prfm pldl1keep, [x24, #64]\n" "fmla v18.4s, v23.4s, v5.4s\n" "prfm pldl1keep, [x24, x16]\n" "fmla v25.4s, v23.4s, v4.4s\n" "fmla v24.4s, v23.4s, v6.4s\n" "fmla v9.4s, v26.4s, v3.4s\n" "ldr s20, [x22, x19]\n" "fmla v14.4s, v28.4s, v0.4s\n" "fmla v13.4s, v28.4s, v1.4s\n" "fmla v25.4s, v26.4s, v5.4s\n" "ldr s26, [x20, x17]\n" "fmla v22.4s, v28.4s, v2.4s\n" "ldr s23, [x9, x11]\n" "fmla v15.4s, v20.4s, v0.4s\n" "add x9, x9, #4\n" "fmla v13.4s, v20.4s, v3.4s\n" "prfm pldl1keep, [x9, #64]\n" "fmla v18.4s, v20.4s, v1.4s\n" "prfm pldl1keep, [x9, x16]\n" "fmla v22.4s, v20.4s, v4.4s\n" "fmla v24.4s, v20.4s, v2.4s\n" "fmla v21.4s, v20.4s, v19.4s\n" "ldr s27, [x23, x19]\n" "fmla v17.4s, v26.4s, v0.4s\n" "ldr s20, [x22, x17]\n" "fmla v18.4s, v26.4s, v3.4s\n" "fmla v25.4s, v26.4s, v1.4s\n" "fmla v22.4s, v26.4s, v5.4s\n" "fmla v24.4s, v26.4s, v4.4s\n" "fmla v21.4s, v26.4s, v6.4s\n" "ldr s19, [x20, x11]\n" "fmla v9.4s, v23.4s, v0.4s\n" "ldr s28, [x23, x17]\n" "fmla v25.4s, v23.4s, v3.4s\n" "add x20, x20, #4\n" "fmla v24.4s, v23.4s, v5.4s\n" "ldr s29, [x22, x11]\n" "fmla v13.4s, v27.4s, v0.4s\n" "prfm pldl1keep, [x20, #64]\n" "fmla v22.4s, v27.4s, v1.4s\n" "add x22, x22, #4\n" "fmla v21.4s, v27.4s, v2.4s\n" "ldr s30, [x23, x11]\n" "fmla v18.4s, v20.4s, v0.4s\n" "ldr s23, [%[wbptr]]\n" "fmla v22.4s, v20.4s, v3.4s\n" "add x23, x23, #4\n" "fmla v24.4s, v20.4s, v1.4s\n" "fmla v21.4s, v20.4s, v4.4s\n" "fmla v25.4s, v19.4s, v0.4s\n" "ldr s20, [%[wbptr], #4]\n" "fmla v22.4s, v28.4s, v0.4s\n" "ldr s6, [%[wbptr], #8]\n" "fmla v21.4s, v19.4s, v5.4s\n" "movi v26.16b, #0\n" "fmla v24.4s, v19.4s, v3.4s\n" "ldr s19, [%[wbptr], #16]\n" "fmax v12.4s, v12.4s, v26.4s\n" "fmax v11.4s, v11.4s, v26.4s\n" "fmla v21.4s, v28.4s, v1.4s\n" "ldr s5, [%[wbptr], #12]\n" "fmla v24.4s, v29.4s, v0.4s\n" "ldr s4, [%[wbptr], #20]\n" "fmax v10.4s, v10.4s, v26.4s\n" "fmax v9.4s, v9.4s, v26.4s\n" "fmla v21.4s, v29.4s, v3.4s\n" "ldr s2, [%[wbptr], #28]\n" "fmov v27.4s, #6.0\n" "fmax v8.4s, v8.4s, v26.4s\n" "fmax v7.4s, v7.4s, v26.4s\n" "fmax v17.4s, v17.4s, v26.4s\n" "fmla v21.4s, v30.4s, v0.4s\n" "ldr s3, [%[wbptr], #24]\n" "fmin v12.4s, v12.4s, v27.4s\n" "ldr s1, [%[wbptr], #32]\n" "fmin v11.4s, v11.4s, v27.4s\n" "fmin v10.4s, v10.4s, v27.4s\n" "str s12, [%[outptr0]]\n" "fmin v9.4s, v9.4s, v27.4s\n" "str s11, [%[outptr0], %[output_col_stride1]]\n" "fmin v8.4s, v8.4s, v27.4s\n" "str s10, [%[outptr0], x27]\n" "fmin v7.4s, v7.4s, v27.4s\n" "str s9, [%[outptr0], x28]\n" "fmin v17.4s, v17.4s, v27.4s\n" "str s8, [x8]\n" "fmax v25.4s, v25.4s, v26.4s\n" "str s7, [x8, %[output_col_stride1]]\n" "fmax v16.4s, v16.4s, v26.4s\n" "str s17, [x8, x27]\n" "fmin v25.4s, v25.4s, v27.4s\n" "fmin v16.4s, v16.4s, v27.4s\n" "ldr s0, [%[wbptr], #36]\n" "str s25, [x8, x28]\n" "fmax v15.4s, v15.4s, v26.4s\n" "str s16, [x25]\n" "fmax v18.4s, v18.4s, v26.4s\n" "fmin v15.4s, v15.4s, v27.4s\n" "ldr s28, [%[inptr0]]\n" "fmin v18.4s, v18.4s, v27.4s\n" "ldr s25, [x24]\n" "str s15, [x25, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v26.4s\n" "str s18, [x25, x27]\n" "fmax v14.4s, v14.4s, v26.4s\n" "fmin v24.4s, v24.4s, v27.4s\n" "ldr s18, [%[inptr0], %[input_col_stride1]]\n" "fmin v14.4s, v14.4s, v27.4s\n" "ldr s30, [x9]\n" "str s24, [x25, x28]\n" "fmax v13.4s, v13.4s, v26.4s\n" "str s14, [x26]\n" "fmax v22.4s, v22.4s, v26.4s\n" "fmin v13.4s, v13.4s, v27.4s\n" "ldr s29, [x24, %[input_col_stride1]]\n" "fmin v22.4s, v22.4s, v27.4s\n" "ldr s24, [%[inptr0], x13]\n" "str s13, [x26, %[output_col_stride1]]\n" "fmax v21.4s, v21.4s, v26.4s\n" "str s22, [x26, x27]\n" "mov v12.16b, v23.16b\n" "fmin v21.4s, v21.4s, v27.4s\n" "ldr s27, [x20]\n" "mov v8.16b, v23.16b\n" "ldr s22, [x9, %[input_col_stride1]]\n" "str s21, [x26, x28]\n" "mov v11.16b, v23.16b\n" "mov v16.16b, v23.16b\n" "add %[outptr0], %[outptr0], #4\n" "mov v7.16b, v23.16b\n" "add x8, x8, #4\n" "mov v10.16b, v23.16b\n" "add x25, x25, #4\n" "mov v14.16b, v23.16b\n" "add x26, x26, #4\n" "mov v15.16b, v23.16b\n" "mov v17.16b, v23.16b\n" "mov v9.16b, v23.16b\n" "fmla v12.4s, v28.4s, v20.4s\n" "fmla v8.4s, v25.4s, v20.4s\n" "fmla v11.4s, v18.4s, v20.4s\n" "fmla v16.4s, v30.4s, v20.4s\n" "fmla v12.4s, v25.4s, v19.4s\n" "fmla v8.4s, v30.4s, v19.4s\n" "fmla v12.4s, v18.4s, v6.4s\n" "fmla v8.4s, v29.4s, v6.4s\n" "fmla v12.4s, v30.4s, v2.4s\n" "fmla v12.4s, v29.4s, v4.4s\n" "bne 5b\n" "6:\n" "mov v13.16b, v23.16b\n" "ldr s21, [x24, x13]\n" "mov v18.16b, v23.16b\n" "prfm pldl1keep, [x24, x10]\n" "fmla v11.4s, v29.4s, v19.4s\n" "prfm pldl1keep, [%[inptr0], x21]\n" "fmla v7.4s, v29.4s, v20.4s\n" "ldr s25, [%[inptr0], x19]\n" "fmla v12.4s, v24.4s, v5.4s\n" "prfm pldl1keep, [x22, #64]\n" "fmla v11.4s, v24.4s, v6.4s\n" "prfm pldl1keep, [x20, x16]\n" "fmla v10.4s, v24.4s, v20.4s\n" "ldr s24, [x22]\n" "fmla v8.4s, v27.4s, v2.4s\n" "prfm pldl1keep, [x9, x10]\n" "fmla v16.4s, v27.4s, v19.4s\n" "prfm pldl1keep, [x24, x21]\n" "fmla v14.4s, v27.4s, v20.4s\n" "ldr s26, [x20, %[input_col_stride1]]\n" "fmla v12.4s, v22.4s, v1.4s\n" "prfm pldl1keep, [%[inptr0], x18]\n" "fmla v8.4s, v22.4s, v4.4s\n" "prfm pldl1keep, [x23, #64]\n" "fmla v11.4s, v22.4s, v2.4s\n" "prfm pldl1keep, [x22, x16]\n" "fmla v16.4s, v22.4s, v6.4s\n" "prfm pldl1keep, [x20, x10]\n" "fmla v7.4s, v22.4s, v19.4s\n" "prfm pldl1keep, [x9, x21]\n" "fmla v15.4s, v22.4s, v20.4s\n" "ldr s30, [x9, x13]\n" "fmla v12.4s, v21.4s, v3.4s\n" "prfm pldl1keep, [x24, x18]\n" "fmla v8.4s, v21.4s, v5.4s\n" "prfm pldl1keep, [%[inptr0], x12]\n" "fmla v11.4s, v21.4s, v4.4s\n" "prfm pldl1keep, [x23, x16]\n" "fmla v7.4s, v21.4s, v6.4s\n" "prfm pldl1keep, [x22, x10]\n" "fmla v10.4s, v21.4s, v19.4s\n" "prfm pldl1keep, [x20, x21]\n" "fmla v17.4s, v21.4s, v20.4s\n" "ldr s22, [x24, x19]\n" "fmla v11.4s, v25.4s, v5.4s\n" "prfm pldl1keep, [x9, x18]\n" "fmla v10.4s, v25.4s, v6.4s\n" "prfm pldl1keep, [x24, x12]\n" "fmla v9.4s, v25.4s, v20.4s\n" "ldr s21, [%[inptr0], x17]\n" "fmla v16.4s, v24.4s, v2.4s\n" "prfm pldl1keep, [x23, x10]\n" "fmla v14.4s, v24.4s, v19.4s\n" "ldr s24, [x23]\n" "fmla v8.4s, v26.4s, v1.4s\n" "prfm pldl1keep, [x22, x21]\n" "fmla v16.4s, v26.4s, v4.4s\n" "prfm pldl1keep, [x20, x18]\n" "fmla v7.4s, v26.4s, v2.4s\n" "prfm pldl1keep, [x9, x12]\n" "fmla v14.4s, v26.4s, v6.4s\n" "prfm pldl1keep, [x23, x21]\n" "fmla v15.4s, v26.4s, v19.4s\n" "prfm pldl1keep, [x22, x18]\n" "fmla v13.4s, v26.4s, v20.4s\n" "ldr s26, [x22, %[input_col_stride1]]\n" "fmla v12.4s, v30.4s, v0.4s\n" "prfm pldl1keep, [x20, x12]\n" "fmla v8.4s, v30.4s, v3.4s\n" "prfm pldl1keep, [x23, x18]\n" "fmla v11.4s, v30.4s, v1.4s\n" "prfm pldl1keep, [x22, x12]\n" "fmla v16.4s, v30.4s, v5.4s\n" "prfm pldl1keep, [x23, x12]\n" "fmla v7.4s, v30.4s, v4.4s\n" "add %[wbptr], %[wbptr], #40\n" "fmla v10.4s, v30.4s, v2.4s\n" "prfm pldl1keep, [%[wbptr], #64]\n" "fmla v15.4s, v30.4s, v6.4s\n" "fmla v17.4s, v30.4s, v19.4s\n" "fmla v18.4s, v30.4s, v20.4s\n" "ldr s27, [x20, x13]\n" "fmla v11.4s, v22.4s, v3.4s\n" "fmla v7.4s, v22.4s, v5.4s\n" "fmla v10.4s, v22.4s, v4.4s\n" "fmla v17.4s, v22.4s, v6.4s\n" "fmla v9.4s, v22.4s, v19.4s\n" "fmla v14.4s, v24.4s, v2.4s\n" "mov v25.16b, v23.16b\n" "fmla v16.4s, v26.4s, v1.4s\n" "fmla v10.4s, v21.4s, v5.4s\n" "fmla v15.4s, v26.4s, v2.4s\n" "fmla v25.4s, v22.4s, v20.4s\n" "ldr s28, [x9, x19]\n" "fmla v9.4s, v21.4s, v6.4s\n" "ldr s29, [x24, x17]\n" "fmla v14.4s, v26.4s, v4.4s\n" "fmla v13.4s, v26.4s, v19.4s\n" "mov v22.16b, v23.16b\n" "fmla v8.4s, v27.4s, v0.4s\n" "fmla v16.4s, v27.4s, v3.4s\n" "fmla v7.4s, v27.4s, v1.4s\n" "fmla v14.4s, v27.4s, v5.4s\n" "fmla v15.4s, v27.4s, v4.4s\n" "fmla v17.4s, v27.4s, v2.4s\n" "fmla v13.4s, v27.4s, v6.4s\n" "fmla v18.4s, v27.4s, v19.4s\n" "fmla v22.4s, v27.4s, v20.4s\n" "mov v24.16b, v23.16b\n" "mov v21.16b, v23.16b\n" "fmla v11.4s, v28.4s, v0.4s\n" "fmla v7.4s, v28.4s, v3.4s\n" "fmla v10.4s, v28.4s, v1.4s\n" "fmla v15.4s, v28.4s, v5.4s\n" "fmla v17.4s, v28.4s, v4.4s\n" "fmla v9.4s, v28.4s, v2.4s\n" "fmla v18.4s, v28.4s, v6.4s\n" "fmla v25.4s, v28.4s, v19.4s\n" "fmla v24.4s, v28.4s, v20.4s\n" "ldr s23, [%[inptr0], x11]\n" "fmla v10.4s, v29.4s, v3.4s\n" "add %[inptr0], %[inptr0], #4\n" "fmla v17.4s, v29.4s, v5.4s\n" "fmla v9.4s, v29.4s, v4.4s\n" "fmla v25.4s, v29.4s, v6.4s\n" "ldr s30, [x23, %[input_col_stride1]]\n" "fmla v14.4s, v30.4s, v1.4s\n" "fmla v13.4s, v30.4s, v2.4s\n" "fmla v9.4s, v23.4s, v5.4s\n" "ldr s23, [x22, x13]\n" "fmla v16.4s, v23.4s, v0.4s\n" "ldr s29, [x20, x19]\n" "fmla v14.4s, v23.4s, v3.4s\n" "fmla v15.4s, v23.4s, v1.4s\n" "fmla v13.4s, v23.4s, v4.4s\n" "fmla v18.4s, v23.4s, v2.4s\n" "fmla v22.4s, v23.4s, v19.4s\n" "ldr s23, [x9, x17]\n" "fmla v7.4s, v29.4s, v0.4s\n" "fmla v15.4s, v29.4s, v3.4s\n" "fmla v17.4s, v29.4s, v1.4s\n" "fmla v13.4s, v29.4s, v5.4s\n" "fmla v18.4s, v29.4s, v4.4s\n" "fmla v25.4s, v29.4s, v2.4s\n" "fmla v22.4s, v29.4s, v6.4s\n" "fmla v24.4s, v29.4s, v19.4s\n" "fmla v21.4s, v29.4s, v20.4s\n" "ldr s26, [x24, x11]\n" "fmla v10.4s, v23.4s, v0.4s\n" "ldr s28, [x23, x13]\n" "fmla v17.4s, v23.4s, v3.4s\n" "add x24, x24, #4\n" "fmla v9.4s, v23.4s, v1.4s\n" "fmla v18.4s, v23.4s, v5.4s\n" "fmla v25.4s, v23.4s, v4.4s\n" "fmla v24.4s, v23.4s, v6.4s\n" "fmla v14.4s, v28.4s, v0.4s\n" "ldr s20, [x22, x19]\n" "fmla v9.4s, v26.4s, v3.4s\n" "fmla v13.4s, v28.4s, v1.4s\n" "fmla v25.4s, v26.4s, v5.4s\n" "ldr s26, [x20, x17]\n" "fmla v22.4s, v28.4s, v2.4s\n" "ldr s23, [x9, x11]\n" "fmla v15.4s, v20.4s, v0.4s\n" "add x9, x9, #4\n" "fmla v13.4s, v20.4s, v3.4s\n" "fmla v18.4s, v20.4s, v1.4s\n" "fmla v22.4s, v20.4s, v4.4s\n" "fmla v24.4s, v20.4s, v2.4s\n" "fmla v21.4s, v20.4s, v19.4s\n" "ldr s27, [x23, x19]\n" "fmla v17.4s, v26.4s, v0.4s\n" "ldr s20, [x22, x17]\n" "fmla v18.4s, v26.4s, v3.4s\n" "fmla v25.4s, v26.4s, v1.4s\n" "fmla v22.4s, v26.4s, v5.4s\n" "fmla v24.4s, v26.4s, v4.4s\n" "fmla v21.4s, v26.4s, v6.4s\n" "ldr s19, [x20, x11]\n" "fmla v9.4s, v23.4s, v0.4s\n" "ldr s28, [x23, x17]\n" "fmla v25.4s, v23.4s, v3.4s\n" "add x20, x20, #4\n" "fmla v24.4s, v23.4s, v5.4s\n" "ldr s29, [x22, x11]\n" "fmla v13.4s, v27.4s, v0.4s\n" "add x22, x22, #4\n" "fmla v22.4s, v27.4s, v1.4s\n" "fmla v21.4s, v27.4s, v2.4s\n" "fmla v18.4s, v20.4s, v0.4s\n" "ldr s30, [x23, x11]\n" "fmla v24.4s, v20.4s, v1.4s\n" "add x23, x23, #4\n" "fmla v22.4s, v20.4s, v3.4s\n" "fmla v21.4s, v20.4s, v4.4s\n" "fmla v25.4s, v19.4s, v0.4s\n" "movi v26.16b, #0\n" "fmla v24.4s, v19.4s, v3.4s\n" "fmov v27.4s, #6.0\n" "fmla v21.4s, v19.4s, v5.4s\n" "fmla v22.4s, v28.4s, v0.4s\n" "fmax v12.4s, v12.4s, v26.4s\n" "fmax v11.4s, v11.4s, v26.4s\n" "fmla v24.4s, v29.4s, v0.4s\n" "fmax v10.4s, v10.4s, v26.4s\n" "fmla v21.4s, v28.4s, v1.4s\n" "fmin v12.4s, v12.4s, v27.4s\n" "fmin v11.4s, v11.4s, v27.4s\n" "fmin v10.4s, v10.4s, v27.4s\n" "str s12, [%[outptr0]]\n" "fmax v9.4s, v9.4s, v26.4s\n" "str s11, [%[outptr0], %[output_col_stride1]]\n" "fmla v21.4s, v29.4s, v3.4s\n" "str s10, [%[outptr0], x27]\n" "fmin v9.4s, v9.4s, v27.4s\n" "fmax v8.4s, v8.4s, v26.4s\n" "fmax v7.4s, v7.4s, v26.4s\n" "str s9, [%[outptr0], x28]\n" "fmla v21.4s, v30.4s, v0.4s\n" "fmin v8.4s, v8.4s, v27.4s\n" "add %[outptr0], %[outptr0], #4\n" "fmin v7.4s, v7.4s, v27.4s\n" "fmax v17.4s, v17.4s, v26.4s\n" "str s8, [x8]\n" "fmax v25.4s, v25.4s, v26.4s\n" "str s7, [x8, %[output_col_stride1]]\n" "fmin v17.4s, v17.4s, v27.4s\n" "fmin v25.4s, v25.4s, v27.4s\n" "fmax v16.4s, v16.4s, v26.4s\n" "str s17, [x8, x27]\n" "fmax v15.4s, v15.4s, v26.4s\n" "str s25, [x8, x28]\n" "fmin v16.4s, v16.4s, v27.4s\n" "fmin v15.4s, v15.4s, v27.4s\n" "add x8, x8, #4\n" "str s16, [x25]\n" "fmax v18.4s, v18.4s, v26.4s\n" "str s15, [x25, %[output_col_stride1]]\n" "fmax v24.4s, v24.4s, v26.4s\n" "fmin v18.4s, v18.4s, v27.4s\n" "fmax v14.4s, v14.4s, v26.4s\n" "fmin v24.4s, v24.4s, v27.4s\n" "fmax v13.4s, v13.4s, v26.4s\n" "str s18, [x25, x27]\n" "fmin v14.4s, v14.4s, v27.4s\n" "str s24, [x25, x28]\n" "fmin v13.4s, v13.4s, v27.4s\n" "str s14, [x26]\n" "fmax v22.4s, v22.4s, v26.4s\n" "str s13, [x26, %[output_col_stride1]]\n" "fmax v21.4s, v21.4s, v26.4s\n" "fmin v22.4s, v22.4s, v27.4s\n" "add x25, x25, #4\n" "fmin v21.4s, v21.4s, v27.4s\n" "str s22, [x26, x27]\n" "str s21, [x26, x28]\n" "add x26, x26, #4\n" "7:\n" : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr) : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)) : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory" ); } #endif // __aarch64__ template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>; } // namespace depthwise