/* * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "arm_gemm.hpp" #include #include #if defined(__aarch64__) namespace arm_conv { namespace depthwise { void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, const int32_t *const requant_shifts, int8_t *const *const outptrs ) { struct Params { long unsigned int n_channels; const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; const int32_t *const requant_shifts; int8_t *const *const outptrs; const int8_t *inptrs[36]; Params( long unsigned int n_channels, const int8_t *const *inptrs_raw, const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, const int32_t *const requant_shifts, int8_t *const *outptrs ) : n_channels(n_channels), weights(weights), bias(bias), requant(&qp), requant_muls(requant_muls), requant_shifts(requant_shifts), outptrs(outptrs) { inptrs[0] = inptrs_raw[0]; inptrs[1] = inptrs_raw[1]; inptrs[2] = inptrs_raw[6]; inptrs[3] = inptrs_raw[7]; inptrs[4] = inptrs_raw[2]; inptrs[5] = inptrs_raw[8]; inptrs[6] = inptrs_raw[3]; inptrs[7] = inptrs_raw[4]; inptrs[8] = inptrs_raw[11]; inptrs[9] = inptrs_raw[12]; inptrs[10] = inptrs_raw[9]; inptrs[11] = inptrs_raw[10]; inptrs[12] = inptrs_raw[5]; inptrs[13] = inptrs_raw[13]; inptrs[14] = inptrs_raw[14]; inptrs[15] = inptrs_raw[15]; inptrs[16] = inptrs_raw[16]; inptrs[17] = inptrs_raw[17]; inptrs[18] = inptrs_raw[18]; inptrs[19] = inptrs_raw[19]; inptrs[20] = inptrs_raw[20]; inptrs[21] = inptrs_raw[21]; inptrs[22] = inptrs_raw[22]; inptrs[23] = inptrs_raw[23]; inptrs[24] = inptrs_raw[24]; inptrs[25] = inptrs_raw[25]; inptrs[26] = inptrs_raw[26]; inptrs[27] = inptrs_raw[27]; inptrs[28] = inptrs_raw[28]; inptrs[29] = inptrs_raw[29]; inptrs[30] = inptrs_raw[30]; inptrs[31] = inptrs_raw[31]; inptrs[32] = inptrs_raw[32]; inptrs[33] = inptrs_raw[33]; inptrs[34] = inptrs_raw[34]; inptrs[35] = inptrs_raw[35]; } }; const Params params(n_channels, inptrs, weights, bias, qp, requant_muls, requant_shifts, outptrs); __asm__ __volatile__( "ldr x10, [%x[params], %[offsetof_Params_requant]]\n" "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n" "add x17, x10, %[offsetof_Requantize32_a_offset]\n" "add x9, x10, %[offsetof_Requantize32_b_offset]\n" "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n" "add x4, x10, %[offsetof_Requantize32_c_offset]\n" "add x14, x10, %[offsetof_Requantize32_minval]\n" "ldr x23, [%x[params], %[offsetof_Params_weights]]\n" "add x5, x10, %[offsetof_Requantize32_maxval]\n" "ld1r { v9.16b }, [x17]\n" "ld1r { v14.16b }, [x9]\n" "lsr x3, x0, #0x3\n" "ld1r { v18.8h }, [x4]\n" "ld1r { v11.8h }, [x14]\n" "mov x24, #0x0\n" "mov x22, #0x0\n" "ld1r { v13.8h }, [x5]\n" "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n" "add x20, %x[params], %[offsetof_Params_inptrs]\n" "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n" "ldp x16, x8, [x25, #0x0]\n" "ldp x4, x7, [x25, #0x10]\n" "cbz x3, 3f\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q15, [x19, #0x0]\n" "subs x3, x3, #0x1\n" "mov v17.16b, v15.16b\n" "ldr q16, [x19, #0x10]\n" "add x19, x19, #0x20\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr d0, [x23, #0x0]\n" "ldr d1, [x23, #0x8]\n" "ldr d2, [x23, #0x10]\n" "mov v8.16b, v16.16b\n" "mov v10.16b, v15.16b\n" "ldr d3, [x23, #0x18]\n" "ldr d4, [x23, #0x20]\n" "mov v7.16b, v16.16b\n" "mov v6.16b, v15.16b\n" "ldp x28, x6, [x20, #0x0]\n" "ldp x26, x25, [x20, #0x10]\n" "mov v5.16b, v16.16b\n" "ssubl v0.8h, v0.8b, v14.8b\n" "ldp x5, x2, [x20, #0x20]\n" "ldp x27, x21, [x20, #0x30]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ldp x12, x19, [x20, #0x40]\n" "ldr d31, [x28, x24]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ssubl v4.8h, v4.8b, v14.8b\n" "ldr d30, [x6, x24]\n" "ldr d29, [x26, x24]\n" "ssubl v31.8h, v31.8b, v9.8b\n" "ssubl v30.8h, v30.8b, v9.8b\n" "ldr d28, [x25, x24]\n" "ldr d27, [x5, x24]\n" "ssubl v29.8h, v29.8b, v9.8b\n" "ssubl v28.8h, v28.8b, v9.8b\n" "ldr d23, [x2, x24]\n" "ldr d25, [x27, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "ssubl v23.8h, v23.8b, v9.8b\n" "ldr d24, [x21, x24]\n" "ldr d26, [x12, x24]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "ssubl v24.8h, v24.8b, v9.8b\n" "ldr d22, [x19, x24]\n" "ssubl v26.8h, v26.8b, v9.8b\n" "ssubl v22.8h, v22.8b, v9.8b\n" "beq 2f\n" "1:" // Loop "smlal v15.4s, v31.4h, v0.4h\n" "smlal2 v16.4s, v31.8h, v0.8h\n" "ldr x19, [x20, #0x50]\n" "ldr d31, [x19, x24]\n" "smlal v17.4s, v30.4h, v0.4h\n" "smlal v10.4s, v29.4h, v0.4h\n" "ldr x15, [x20, #0x58]\n" "ssubl v31.8h, v31.8b, v9.8b\n" "smlal v6.4s, v28.4h, v0.4h\n" "smlal2 v8.4s, v30.8h, v0.8h\n" "ldr x19, [x20, #0x60]\n" "ldr x27, [x20, #0x68]\n" "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" "ldr x5, [x20, #0x70]\n" "ldr x11, [x20, #0x78]\n" "smlal2 v16.4s, v30.8h, v1.8h\n" "smlal2 v5.4s, v28.8h, v0.8h\n" "ldr d30, [x15, x24]\n" "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v17.4s, v27.4h, v1.4h\n" "smlal v10.4s, v28.4h, v1.4h\n" "ldr d0, [x23, #0x28]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "smlal v6.4s, v23.4h, v1.4h\n" "smlal2 v8.4s, v27.8h, v1.8h\n" "ldr x12, [x20, #0x80]\n" "ldr x26, [x20, #0x88]\n" "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" "ldr x14, [x20, #0x90]\n" "ldr x15, [x20, #0x98]\n" "smlal2 v16.4s, v27.8h, v2.8h\n" "smlal2 v5.4s, v23.8h, v1.8h\n" "ldr d27, [x19, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal v17.4s, v25.4h, v2.4h\n" "smlal v10.4s, v23.4h, v2.4h\n" "ldr d1, [x23, #0x30]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v31.4h, v2.4h\n" "smlal2 v8.4s, v25.8h, v2.8h\n" "ldr x21, [x20, #0xa0]\n" "ldr x2, [x20, #0xa8]\n" "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" "ldr x13, [x20, #0xb0]\n" "ldr x9, [x20, #0xb8]\n" "smlal2 v16.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v31.8h, v2.8h\n" "ldr d25, [x27, x24]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal v17.4s, v24.4h, v3.4h\n" "smlal v10.4s, v31.4h, v3.4h\n" "ldr d2, [x23, #0x38]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v30.4h, v3.4h\n" "smlal2 v8.4s, v24.8h, v3.8h\n" "ldr x19, [x20, #0xc0]\n" "ldr x28, [x20, #0xc8]\n" "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" "ldr x6, [x20, #0xd0]\n" "ldr x27, [x20, #0xd8]\n" "smlal2 v16.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v30.8h, v3.8h\n" "ldr d24, [x5, x24]\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal v17.4s, v27.4h, v4.4h\n" "smlal v10.4s, v30.4h, v4.4h\n" "ldr d3, [x23, #0x40]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v26.4h, v4.4h\n" "smlal2 v8.4s, v27.8h, v4.8h\n" "ldr d27, [x11, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal2 v7.4s, v30.8h, v4.8h\n" "smlal v15.4s, v29.4h, v0.4h\n" "ldr x11, [x20, #0xe0]\n" "ldr x17, [x20, #0xe8]\n" "smlal2 v16.4s, v29.8h, v0.8h\n" "smlal2 v5.4s, v26.8h, v4.8h\n" "ldr d4, [x23, #0x48]\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v17.4s, v28.4h, v0.4h\n" "smlal v10.4s, v22.4h, v0.4h\n" "ldr x5, [x20, #0xf0]\n" "ldr q12, [x10, #0x0]\n" "smlal v6.4s, v25.4h, v0.4h\n" "smlal2 v8.4s, v28.8h, v0.8h\n" "ldr q19, [x1, #0x0]\n" "ldr q20, [x10, #0x10]\n" "smlal2 v7.4s, v22.8h, v0.8h\n" "smlal v15.4s, v28.4h, v1.4h\n" "ldr q29, [x1, #0x10]\n" "subs x3, x3, #0x1\n" "smlal2 v16.4s, v28.8h, v1.8h\n" "smlal2 v5.4s, v25.8h, v0.8h\n" "ldr d28, [x26, x24]\n" "ldr d0, [x23, #0x50]\n" "smlal v17.4s, v23.4h, v1.4h\n" "smlal v10.4s, v25.4h, v1.4h\n" "ssubl v28.8h, v28.8b, v9.8b\n" "ldr x25, [x20, #0xf8]\n" "smlal v6.4s, v24.4h, v1.4h\n" "smlal2 v8.4s, v23.8h, v1.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" "add x10, x10, #0x20\n" "smlal2 v7.4s, v25.8h, v1.8h\n" "smlal v15.4s, v23.4h, v2.4h\n" "add x1, x1, #0x20\n" "smlal2 v16.4s, v23.8h, v2.8h\n" "ldr d23, [x12, x24]\n" "smlal2 v5.4s, v24.8h, v1.8h\n" "ssubl v23.8h, v23.8b, v9.8b\n" "smlal v17.4s, v31.4h, v2.4h\n" "smlal v10.4s, v24.4h, v2.4h\n" "ldr d1, [x23, #0x58]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v27.4h, v2.4h\n" "smlal2 v8.4s, v31.8h, v2.8h\n" "ldr x26, [x20, #0x100]\n" "smlal2 v7.4s, v24.8h, v2.8h\n" "smlal v15.4s, v31.4h, v3.4h\n" "smlal2 v16.4s, v31.8h, v3.8h\n" "smlal2 v5.4s, v27.8h, v2.8h\n" "ldr d31, [x14, x24]\n" "ssubl v31.8h, v31.8b, v9.8b\n" "smlal v17.4s, v30.4h, v3.4h\n" "smlal v10.4s, v27.4h, v3.4h\n" "ldr d2, [x23, #0x60]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v23.4h, v3.4h\n" "smlal2 v8.4s, v30.8h, v3.8h\n" "ldr x12, [x20, #0x108]\n" "smlal2 v7.4s, v27.8h, v3.8h\n" "smlal v15.4s, v30.4h, v4.4h\n" "smlal2 v16.4s, v30.8h, v4.8h\n" "ldr d30, [x15, x24]\n" "smlal2 v5.4s, v23.8h, v3.8h\n" "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v17.4s, v26.4h, v4.4h\n" "smlal v10.4s, v23.4h, v4.4h\n" "ldr d3, [x23, #0x68]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v28.4h, v4.4h\n" "smlal2 v8.4s, v26.8h, v4.8h\n" "ldr d26, [x21, x24]\n" "ssubl v26.8h, v26.8b, v9.8b\n" "smlal2 v7.4s, v23.8h, v4.8h\n" "smlal v15.4s, v22.4h, v0.4h\n" "ldr x14, [x20, #0x110]\n" "ldr x21, [x20, #0x118]\n" "smlal2 v16.4s, v22.8h, v0.8h\n" "smlal2 v5.4s, v28.8h, v4.8h\n" "ldr d4, [x23, #0x70]\n" "ldr d22, [x9, x24]\n" "smlal v17.4s, v25.4h, v0.4h\n" "smlal v10.4s, v31.4h, v0.4h\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v6.4s, v30.4h, v0.4h\n" "smlal2 v8.4s, v25.8h, v0.8h\n" "ssubl v22.8h, v22.8b, v9.8b\n" "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" "smlal2 v16.4s, v25.8h, v1.8h\n" "ldr d25, [x2, x24]\n" "smlal2 v5.4s, v30.8h, v0.8h\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal v17.4s, v24.4h, v1.4h\n" "smlal v10.4s, v30.4h, v1.4h\n" "ldr d0, [x23, #0x78]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "smlal v6.4s, v26.4h, v1.4h\n" "smlal2 v8.4s, v24.8h, v1.8h\n" "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v16.4s, v24.8h, v2.8h\n" "ldr d24, [x13, x24]\n" "smlal2 v5.4s, v26.8h, v1.8h\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal v17.4s, v27.4h, v2.4h\n" "smlal v10.4s, v26.4h, v2.4h\n" "ldr d1, [x23, #0x80]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v25.4h, v2.4h\n" "smlal2 v8.4s, v27.8h, v2.8h\n" "smlal2 v7.4s, v26.8h, v2.8h\n" "smlal v15.4s, v27.4h, v3.4h\n" "smlal2 v16.4s, v27.8h, v3.8h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" "ldr d27, [x19, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal v17.4s, v23.4h, v3.4h\n" "smlal v10.4s, v25.4h, v3.4h\n" "ldr d2, [x23, #0x88]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v24.4h, v3.4h\n" "smlal2 v8.4s, v23.8h, v3.8h\n" "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal v15.4s, v23.4h, v4.4h\n" "smlal2 v16.4s, v23.8h, v4.8h\n" "ldr d23, [x28, x24]\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "ssubl v23.8h, v23.8b, v9.8b\n" "smlal v17.4s, v28.4h, v4.4h\n" "smlal v10.4s, v24.4h, v4.4h\n" "ldr d3, [x23, #0x90]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v22.4h, v4.4h\n" "smlal2 v8.4s, v28.8h, v4.8h\n" "ldr d28, [x11, x24]\n" "ssubl v28.8h, v28.8b, v9.8b\n" "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, v31.4h, v0.4h\n" "smlal2 v16.4s, v31.8h, v0.8h\n" "ldr d31, [x6, x24]\n" "smlal2 v5.4s, v22.8h, v4.8h\n" "ssubl v31.8h, v31.8b, v9.8b\n" "smlal v17.4s, v30.4h, v0.4h\n" "smlal v10.4s, v27.4h, v0.4h\n" "ldr d4, [x23, #0x98]\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v6.4s, v23.4h, v0.4h\n" "smlal2 v8.4s, v30.8h, v0.8h\n" "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" "smlal2 v16.4s, v30.8h, v1.8h\n" "ldr d30, [x27, x24]\n" "smlal2 v5.4s, v23.8h, v0.8h\n" "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v17.4s, v26.4h, v1.4h\n" "smlal v10.4s, v23.4h, v1.4h\n" "ldr d0, [x23, #0xa0]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "smlal v6.4s, v31.4h, v1.4h\n" "smlal2 v8.4s, v26.8h, v1.8h\n" "smlal2 v7.4s, v23.8h, v1.8h\n" "smlal v15.4s, v26.4h, v2.4h\n" "smlal2 v16.4s, v26.8h, v2.8h\n" "smlal2 v5.4s, v31.8h, v1.8h\n" "ldr d26, [x17, x24]\n" "ssubl v26.8h, v26.8b, v9.8b\n" "smlal v17.4s, v25.4h, v2.4h\n" "smlal v10.4s, v31.4h, v2.4h\n" "ldr d1, [x23, #0xa8]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v30.4h, v2.4h\n" "smlal2 v8.4s, v25.8h, v2.8h\n" "smlal2 v7.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" "smlal2 v16.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v30.8h, v2.8h\n" "ldr d25, [x5, x24]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal v17.4s, v24.4h, v3.4h\n" "smlal v10.4s, v30.4h, v3.4h\n" "ldr d2, [x23, #0xb0]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v28.4h, v3.4h\n" "smlal2 v8.4s, v24.8h, v3.8h\n" "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" "smlal2 v16.4s, v24.8h, v4.8h\n" "ldr d24, [x25, x24]\n" "smlal2 v5.4s, v28.8h, v3.8h\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal v17.4s, v22.4h, v4.4h\n" "smlal v10.4s, v28.4h, v4.4h\n" "ldr d3, [x23, #0xb8]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v26.4h, v4.4h\n" "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" "smlal2 v16.4s, v27.8h, v0.8h\n" "ldr d27, [x26, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal2 v8.4s, v22.8h, v4.8h\n" "smlal2 v5.4s, v26.8h, v4.8h\n" "ldr d4, [x23, #0xc0]\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v17.4s, v23.4h, v0.4h\n" "smlal v10.4s, v25.4h, v0.4h\n" "add x23, x23, #0xc8\n" "smlal v6.4s, v24.4h, v0.4h\n" "smlal2 v7.4s, v25.8h, v0.8h\n" "ldr d25, [x12, x24]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal2 v8.4s, v23.8h, v0.8h\n" "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" "smlal v17.4s, v31.4h, v1.4h\n" "smlal v10.4s, v24.4h, v1.4h\n" "smlal v6.4s, v27.4h, v1.4h\n" "smlal2 v7.4s, v24.8h, v1.8h\n" "ldr d24, [x14, x24]\n" "smlal2 v16.4s, v23.8h, v1.8h\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal2 v8.4s, v31.8h, v1.8h\n" "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" "smlal v17.4s, v30.4h, v2.4h\n" "smlal v10.4s, v27.4h, v2.4h\n" "smlal v6.4s, v25.4h, v2.4h\n" "smlal2 v7.4s, v27.8h, v2.8h\n" "ldr d27, [x21, x24]\n" "smlal2 v16.4s, v31.8h, v2.8h\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal2 v8.4s, v30.8h, v2.8h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" "smlal v17.4s, v28.4h, v3.4h\n" "smlal v10.4s, v25.4h, v3.4h\n" "smlal v6.4s, v24.4h, v3.4h\n" "smlal2 v16.4s, v30.8h, v3.8h\n" "smlal2 v8.4s, v28.8h, v3.8h\n" "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" "smlal v17.4s, v26.4h, v4.4h\n" "sqrdmulh v15.4s, v15.4s, v12.4s\n" "smlal v10.4s, v24.4h, v4.4h\n" "smlal v6.4s, v27.4h, v4.4h\n" "sqrdmulh v17.4s, v17.4s, v12.4s\n" "smlal2 v16.4s, v28.8h, v4.8h\n" "smlal2 v8.4s, v26.8h, v4.8h\n" "sqrdmulh v10.4s, v10.4s, v12.4s\n" "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v27.8h, v4.8h\n" "sqrdmulh v6.4s, v6.4s, v12.4s\n" "and v23.16b, v15.16b, v19.16b\n" "sqrdmulh v16.4s, v16.4s, v20.4s\n" "and v22.16b, v17.16b, v19.16b\n" "sqrdmulh v8.4s, v8.4s, v20.4s\n" "and v21.16b, v10.16b, v19.16b\n" "sqrdmulh v7.4s, v7.4s, v20.4s\n" "and v26.16b, v6.16b, v19.16b\n" "sqrdmulh v5.4s, v5.4s, v20.4s\n" "sshr v23.4s, v23.4s, #0x1f\n" "and v4.16b, v16.16b, v29.16b\n" "sshr v22.4s, v22.4s, #0x1f\n" "and v2.16b, v8.16b, v29.16b\n" "sshr v21.4s, v21.4s, #0x1f\n" "and v3.16b, v7.16b, v29.16b\n" "sshr v26.4s, v26.4s, #0x1f\n" "and v25.16b, v5.16b, v29.16b\n" "sqadd v15.4s, v15.4s, v23.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v21.4s\n" "sshr v3.4s, v3.4s, #0x1f\n" "sqadd v6.4s, v6.4s, v26.4s\n" "sshr v25.4s, v25.4s, #0x1f\n" "srshl v15.4s, v15.4s, v19.4s\n" "sqadd v16.4s, v16.4s, v4.4s\n" "srshl v17.4s, v17.4s, v19.4s\n" "sqadd v8.4s, v8.4s, v2.4s\n" "srshl v10.4s, v10.4s, v19.4s\n" "sqadd v7.4s, v7.4s, v3.4s\n" "srshl v6.4s, v6.4s, v19.4s\n" "sqadd v5.4s, v5.4s, v25.4s\n" "srshl v16.4s, v16.4s, v29.4s\n" "sqxtn v15.4h, v15.4s\n" "srshl v8.4s, v8.4s, v29.4s\n" "sqxtn v17.4h, v17.4s\n" "srshl v7.4s, v7.4s, v29.4s\n" "sqxtn v10.4h, v10.4s\n" "srshl v5.4s, v5.4s, v29.4s\n" "sqxtn v6.4h, v6.4s\n" "sqxtn2 v15.8h, v16.4s\n" "sqxtn2 v17.8h, v8.4s\n" "sqxtn2 v10.8h, v7.4s\n" "sqxtn2 v6.8h, v5.4s\n" "sqadd v15.8h, v15.8h, v18.8h\n" "sqadd v17.8h, v17.8h, v18.8h\n" "sqadd v10.8h, v10.8h, v18.8h\n" "sqadd v6.8h, v6.8h, v18.8h\n" "smax v15.8h, v15.8h, v11.8h\n" "smax v17.8h, v17.8h, v11.8h\n" "smax v10.8h, v10.8h, v11.8h\n" "smax v6.8h, v6.8h, v11.8h\n" "smin v15.8h, v15.8h, v13.8h\n" "smin v17.8h, v17.8h, v13.8h\n" "smin v10.8h, v10.8h, v13.8h\n" "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" "uzp1 v17.16b, v17.16b, v17.16b\n" "str d15, [x16, x22]\n" "uzp1 v10.16b, v10.16b, v10.16b\n" "uzp1 v6.16b, v6.16b, v6.16b\n" "str d17, [x8, x22]\n" "str d10, [x4, x22]\n" "str d6, [x7, x22]\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q15, [x19, #0x0]\n" "add x22, x22, #0x8\n" "ldr q16, [x19, #0x10]\n" "add x19, x19, #0x20\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr d0, [x23, #0x0]\n" "ldr d1, [x23, #0x8]\n" "ldr d2, [x23, #0x10]\n" "mov v17.16b, v15.16b\n" "mov v8.16b, v16.16b\n" "ldr d3, [x23, #0x18]\n" "ldr d4, [x23, #0x20]\n" "mov v10.16b, v15.16b\n" "mov v7.16b, v16.16b\n" "ldp x28, x6, [x20, #0x0]\n" "ldp x26, x25, [x20, #0x10]\n" "mov v6.16b, v15.16b\n" "mov v5.16b, v16.16b\n" "ldp x5, x2, [x20, #0x20]\n" "ldp x27, x21, [x20, #0x30]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "ssubl v1.8h, v1.8b, v14.8b\n" "ldp x12, x19, [x20, #0x40]\n" "ldr d31, [x28, x24]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ldr d30, [x6, x24]\n" "ldr d29, [x26, x24]\n" "ssubl v4.8h, v4.8b, v14.8b\n" "ssubl v31.8h, v31.8b, v9.8b\n" "ldr d28, [x25, x24]\n" "ldr d27, [x5, x24]\n" "ssubl v30.8h, v30.8b, v9.8b\n" "ssubl v29.8h, v29.8b, v9.8b\n" "ldr d23, [x2, x24]\n" "ldr d25, [x27, x24]\n" "ssubl v28.8h, v28.8b, v9.8b\n" "ssubl v27.8h, v27.8b, v9.8b\n" "ldr d24, [x21, x24]\n" "ldr d26, [x12, x24]\n" "ssubl v23.8h, v23.8b, v9.8b\n" "ssubl v25.8h, v25.8b, v9.8b\n" "ldr d22, [x19, x24]\n" "ssubl v24.8h, v24.8b, v9.8b\n" "ssubl v26.8h, v26.8b, v9.8b\n" "ssubl v22.8h, v22.8b, v9.8b\n" "bgt 1b\n" "2:" // Tail "smlal v15.4s, v31.4h, v0.4h\n" "smlal2 v16.4s, v31.8h, v0.8h\n" "ldr x19, [x20, #0x50]\n" "ldr d31, [x19, x24]\n" "smlal v17.4s, v30.4h, v0.4h\n" "smlal v10.4s, v29.4h, v0.4h\n" "ldr x15, [x20, #0x58]\n" "ssubl v31.8h, v31.8b, v9.8b\n" "smlal v6.4s, v28.4h, v0.4h\n" "smlal2 v8.4s, v30.8h, v0.8h\n" "ldr x19, [x20, #0x60]\n" "ldr x27, [x20, #0x68]\n" "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" "ldr x5, [x20, #0x70]\n" "ldr x11, [x20, #0x78]\n" "smlal2 v16.4s, v30.8h, v1.8h\n" "smlal2 v5.4s, v28.8h, v0.8h\n" "ldr d30, [x15, x24]\n" "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v17.4s, v27.4h, v1.4h\n" "smlal v10.4s, v28.4h, v1.4h\n" "ldr d0, [x23, #0x28]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "smlal v6.4s, v23.4h, v1.4h\n" "smlal2 v8.4s, v27.8h, v1.8h\n" "ldr x12, [x20, #0x80]\n" "ldr x26, [x20, #0x88]\n" "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" "ldr x14, [x20, #0x90]\n" "ldr x15, [x20, #0x98]\n" "smlal2 v16.4s, v27.8h, v2.8h\n" "smlal2 v5.4s, v23.8h, v1.8h\n" "ldr d27, [x19, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal v17.4s, v25.4h, v2.4h\n" "smlal v10.4s, v23.4h, v2.4h\n" "ldr d1, [x23, #0x30]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v31.4h, v2.4h\n" "smlal2 v8.4s, v25.8h, v2.8h\n" "ldr x21, [x20, #0xa0]\n" "ldr x2, [x20, #0xa8]\n" "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" "ldr x13, [x20, #0xb0]\n" "ldr x9, [x20, #0xb8]\n" "smlal2 v16.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v31.8h, v2.8h\n" "ldr d25, [x27, x24]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal v17.4s, v24.4h, v3.4h\n" "smlal v10.4s, v31.4h, v3.4h\n" "ldr d2, [x23, #0x38]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v30.4h, v3.4h\n" "smlal2 v8.4s, v24.8h, v3.8h\n" "ldr x19, [x20, #0xc0]\n" "ldr x28, [x20, #0xc8]\n" "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" "ldr x6, [x20, #0xd0]\n" "ldr x27, [x20, #0xd8]\n" "smlal2 v16.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v30.8h, v3.8h\n" "ldr d24, [x5, x24]\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal v17.4s, v27.4h, v4.4h\n" "smlal v10.4s, v30.4h, v4.4h\n" "ldr d3, [x23, #0x40]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v26.4h, v4.4h\n" "smlal2 v8.4s, v27.8h, v4.8h\n" "ldr d27, [x11, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal2 v7.4s, v30.8h, v4.8h\n" "smlal v15.4s, v29.4h, v0.4h\n" "ldr x11, [x20, #0xe0]\n" "ldr x17, [x20, #0xe8]\n" "smlal2 v16.4s, v29.8h, v0.8h\n" "smlal2 v5.4s, v26.8h, v4.8h\n" "ldr d4, [x23, #0x48]\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v17.4s, v28.4h, v0.4h\n" "smlal v10.4s, v22.4h, v0.4h\n" "ldr x5, [x20, #0xf0]\n" "ldr x25, [x20, #0xf8]\n" "smlal v6.4s, v25.4h, v0.4h\n" "smlal2 v8.4s, v28.8h, v0.8h\n" "ldr q12, [x10, #0x0]\n" "ldr q19, [x1, #0x0]\n" "smlal2 v7.4s, v22.8h, v0.8h\n" "smlal v15.4s, v28.4h, v1.4h\n" "ldr q20, [x10, #0x10]\n" "ldr q29, [x1, #0x10]\n" "smlal2 v16.4s, v28.8h, v1.8h\n" "smlal2 v5.4s, v25.8h, v0.8h\n" "ldr d28, [x26, x24]\n" "ldr d0, [x23, #0x50]\n" "smlal v17.4s, v23.4h, v1.4h\n" "smlal v10.4s, v25.4h, v1.4h\n" "ssubl v28.8h, v28.8b, v9.8b\n" "ldr x26, [x20, #0x100]\n" "smlal v6.4s, v24.4h, v1.4h\n" "smlal2 v8.4s, v23.8h, v1.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" "tst x0, #0x7\n" "smlal2 v7.4s, v25.8h, v1.8h\n" "smlal v15.4s, v23.4h, v2.4h\n" "add x10, x10, #0x20\n" "add x1, x1, #0x20\n" "smlal2 v16.4s, v23.8h, v2.8h\n" "ldr d23, [x12, x24]\n" "smlal2 v5.4s, v24.8h, v1.8h\n" "ssubl v23.8h, v23.8b, v9.8b\n" "smlal v17.4s, v31.4h, v2.4h\n" "smlal v10.4s, v24.4h, v2.4h\n" "ldr d1, [x23, #0x58]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v27.4h, v2.4h\n" "smlal2 v8.4s, v31.8h, v2.8h\n" "ldr x12, [x20, #0x108]\n" "smlal2 v7.4s, v24.8h, v2.8h\n" "smlal v15.4s, v31.4h, v3.4h\n" "smlal2 v16.4s, v31.8h, v3.8h\n" "smlal2 v5.4s, v27.8h, v2.8h\n" "ldr d31, [x14, x24]\n" "ssubl v31.8h, v31.8b, v9.8b\n" "smlal v17.4s, v30.4h, v3.4h\n" "smlal v10.4s, v27.4h, v3.4h\n" "ldr d2, [x23, #0x60]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v23.4h, v3.4h\n" "smlal2 v8.4s, v30.8h, v3.8h\n" "ldr x14, [x20, #0x110]\n" "smlal2 v7.4s, v27.8h, v3.8h\n" "smlal v15.4s, v30.4h, v4.4h\n" "smlal2 v16.4s, v30.8h, v4.8h\n" "ldr d30, [x15, x24]\n" "smlal2 v5.4s, v23.8h, v3.8h\n" "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v17.4s, v26.4h, v4.4h\n" "smlal v10.4s, v23.4h, v4.4h\n" "ldr d3, [x23, #0x68]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v28.4h, v4.4h\n" "smlal2 v8.4s, v26.8h, v4.8h\n" "ldr d26, [x21, x24]\n" "ssubl v26.8h, v26.8b, v9.8b\n" "smlal2 v7.4s, v23.8h, v4.8h\n" "smlal v15.4s, v22.4h, v0.4h\n" "ldr x21, [x20, #0x118]\n" "smlal2 v16.4s, v22.8h, v0.8h\n" "smlal2 v5.4s, v28.8h, v4.8h\n" "ldr d4, [x23, #0x70]\n" "ldr d22, [x9, x24]\n" "smlal v17.4s, v25.4h, v0.4h\n" "smlal v10.4s, v31.4h, v0.4h\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v6.4s, v30.4h, v0.4h\n" "smlal2 v8.4s, v25.8h, v0.8h\n" "ssubl v22.8h, v22.8b, v9.8b\n" "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" "smlal2 v16.4s, v25.8h, v1.8h\n" "ldr d25, [x2, x24]\n" "smlal2 v5.4s, v30.8h, v0.8h\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal v17.4s, v24.4h, v1.4h\n" "smlal v10.4s, v30.4h, v1.4h\n" "ldr d0, [x23, #0x78]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "smlal v6.4s, v26.4h, v1.4h\n" "smlal2 v8.4s, v24.8h, v1.8h\n" "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v16.4s, v24.8h, v2.8h\n" "ldr d24, [x13, x24]\n" "smlal2 v5.4s, v26.8h, v1.8h\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal v17.4s, v27.4h, v2.4h\n" "smlal v10.4s, v26.4h, v2.4h\n" "ldr d1, [x23, #0x80]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v25.4h, v2.4h\n" "smlal2 v8.4s, v27.8h, v2.8h\n" "smlal2 v7.4s, v26.8h, v2.8h\n" "smlal v15.4s, v27.4h, v3.4h\n" "smlal2 v16.4s, v27.8h, v3.8h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" "ldr d27, [x19, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal v17.4s, v23.4h, v3.4h\n" "smlal v10.4s, v25.4h, v3.4h\n" "ldr d2, [x23, #0x88]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v24.4h, v3.4h\n" "smlal2 v8.4s, v23.8h, v3.8h\n" "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal v15.4s, v23.4h, v4.4h\n" "smlal2 v16.4s, v23.8h, v4.8h\n" "ldr d23, [x28, x24]\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "ssubl v23.8h, v23.8b, v9.8b\n" "smlal v17.4s, v28.4h, v4.4h\n" "smlal v10.4s, v24.4h, v4.4h\n" "ldr d3, [x23, #0x90]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v22.4h, v4.4h\n" "smlal2 v8.4s, v28.8h, v4.8h\n" "ldr d28, [x11, x24]\n" "ssubl v28.8h, v28.8b, v9.8b\n" "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, v31.4h, v0.4h\n" "smlal2 v16.4s, v31.8h, v0.8h\n" "ldr d31, [x6, x24]\n" "smlal2 v5.4s, v22.8h, v4.8h\n" "ssubl v31.8h, v31.8b, v9.8b\n" "smlal v17.4s, v30.4h, v0.4h\n" "smlal v10.4s, v27.4h, v0.4h\n" "ldr d4, [x23, #0x98]\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v6.4s, v23.4h, v0.4h\n" "smlal2 v8.4s, v30.8h, v0.8h\n" "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" "smlal2 v16.4s, v30.8h, v1.8h\n" "ldr d30, [x27, x24]\n" "smlal2 v5.4s, v23.8h, v0.8h\n" "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v17.4s, v26.4h, v1.4h\n" "smlal v10.4s, v23.4h, v1.4h\n" "ldr d0, [x23, #0xa0]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "smlal v6.4s, v31.4h, v1.4h\n" "smlal2 v8.4s, v26.8h, v1.8h\n" "smlal2 v7.4s, v23.8h, v1.8h\n" "smlal v15.4s, v26.4h, v2.4h\n" "smlal2 v16.4s, v26.8h, v2.8h\n" "smlal2 v5.4s, v31.8h, v1.8h\n" "ldr d26, [x17, x24]\n" "ssubl v26.8h, v26.8b, v9.8b\n" "smlal v17.4s, v25.4h, v2.4h\n" "smlal v10.4s, v31.4h, v2.4h\n" "ldr d1, [x23, #0xa8]\n" "ssubl v1.8h, v1.8b, v14.8b\n" "smlal v6.4s, v30.4h, v2.4h\n" "smlal2 v8.4s, v25.8h, v2.8h\n" "smlal2 v7.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" "smlal2 v16.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v30.8h, v2.8h\n" "ldr d25, [x5, x24]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal v17.4s, v24.4h, v3.4h\n" "smlal v10.4s, v30.4h, v3.4h\n" "ldr d2, [x23, #0xb0]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "smlal v6.4s, v28.4h, v3.4h\n" "smlal2 v8.4s, v24.8h, v3.8h\n" "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" "smlal2 v16.4s, v24.8h, v4.8h\n" "ldr d24, [x25, x24]\n" "smlal2 v5.4s, v28.8h, v3.8h\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal v17.4s, v22.4h, v4.4h\n" "smlal v10.4s, v28.4h, v4.4h\n" "ldr d3, [x23, #0xb8]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "smlal v6.4s, v26.4h, v4.4h\n" "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" "smlal2 v16.4s, v27.8h, v0.8h\n" "ldr d27, [x26, x24]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal2 v8.4s, v22.8h, v4.8h\n" "smlal2 v5.4s, v26.8h, v4.8h\n" "ldr d4, [x23, #0xc0]\n" "ssubl v4.8h, v4.8b, v14.8b\n" "smlal v17.4s, v23.4h, v0.4h\n" "smlal v10.4s, v25.4h, v0.4h\n" "smlal v6.4s, v24.4h, v0.4h\n" "smlal2 v7.4s, v25.8h, v0.8h\n" "ldr d25, [x12, x24]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal2 v8.4s, v23.8h, v0.8h\n" "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" "smlal v17.4s, v31.4h, v1.4h\n" "smlal v10.4s, v24.4h, v1.4h\n" "smlal v6.4s, v27.4h, v1.4h\n" "smlal2 v7.4s, v24.8h, v1.8h\n" "ldr d24, [x14, x24]\n" "smlal2 v16.4s, v23.8h, v1.8h\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal2 v8.4s, v31.8h, v1.8h\n" "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" "smlal v17.4s, v30.4h, v2.4h\n" "smlal v10.4s, v27.4h, v2.4h\n" "smlal v6.4s, v25.4h, v2.4h\n" "smlal2 v7.4s, v27.8h, v2.8h\n" "ldr d27, [x21, x24]\n" "smlal2 v16.4s, v31.8h, v2.8h\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal2 v8.4s, v30.8h, v2.8h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" "smlal v17.4s, v28.4h, v3.4h\n" "smlal v10.4s, v25.4h, v3.4h\n" "smlal v6.4s, v24.4h, v3.4h\n" "smlal2 v16.4s, v30.8h, v3.8h\n" "smlal2 v8.4s, v28.8h, v3.8h\n" "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" "smlal v17.4s, v26.4h, v4.4h\n" "sqrdmulh v15.4s, v15.4s, v12.4s\n" "smlal v10.4s, v24.4h, v4.4h\n" "smlal v6.4s, v27.4h, v4.4h\n" "sqrdmulh v17.4s, v17.4s, v12.4s\n" "smlal2 v16.4s, v28.8h, v4.8h\n" "smlal2 v8.4s, v26.8h, v4.8h\n" "sqrdmulh v10.4s, v10.4s, v12.4s\n" "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v27.8h, v4.8h\n" "sqrdmulh v6.4s, v6.4s, v12.4s\n" "and v23.16b, v15.16b, v19.16b\n" "sqrdmulh v16.4s, v16.4s, v20.4s\n" "and v22.16b, v17.16b, v19.16b\n" "sqrdmulh v8.4s, v8.4s, v20.4s\n" "and v21.16b, v10.16b, v19.16b\n" "sqrdmulh v7.4s, v7.4s, v20.4s\n" "and v26.16b, v6.16b, v19.16b\n" "sqrdmulh v5.4s, v5.4s, v20.4s\n" "sshr v23.4s, v23.4s, #0x1f\n" "and v4.16b, v16.16b, v29.16b\n" "sshr v22.4s, v22.4s, #0x1f\n" "and v2.16b, v8.16b, v29.16b\n" "sshr v21.4s, v21.4s, #0x1f\n" "and v3.16b, v7.16b, v29.16b\n" "sshr v26.4s, v26.4s, #0x1f\n" "and v25.16b, v5.16b, v29.16b\n" "sqadd v15.4s, v15.4s, v23.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v21.4s\n" "sshr v3.4s, v3.4s, #0x1f\n" "sqadd v6.4s, v6.4s, v26.4s\n" "sshr v25.4s, v25.4s, #0x1f\n" "srshl v15.4s, v15.4s, v19.4s\n" "sqadd v16.4s, v16.4s, v4.4s\n" "srshl v17.4s, v17.4s, v19.4s\n" "sqadd v8.4s, v8.4s, v2.4s\n" "srshl v10.4s, v10.4s, v19.4s\n" "sqadd v7.4s, v7.4s, v3.4s\n" "srshl v6.4s, v6.4s, v19.4s\n" "sqadd v5.4s, v5.4s, v25.4s\n" "srshl v16.4s, v16.4s, v29.4s\n" "sqxtn v15.4h, v15.4s\n" "srshl v8.4s, v8.4s, v29.4s\n" "sqxtn v17.4h, v17.4s\n" "srshl v7.4s, v7.4s, v29.4s\n" "sqxtn v10.4h, v10.4s\n" "srshl v5.4s, v5.4s, v29.4s\n" "sqxtn v6.4h, v6.4s\n" "sqxtn2 v15.8h, v16.4s\n" "sqxtn2 v17.8h, v8.4s\n" "sqxtn2 v10.8h, v7.4s\n" "sqxtn2 v6.8h, v5.4s\n" "sqadd v15.8h, v15.8h, v18.8h\n" "sqadd v17.8h, v17.8h, v18.8h\n" "sqadd v10.8h, v10.8h, v18.8h\n" "sqadd v6.8h, v6.8h, v18.8h\n" "smax v15.8h, v15.8h, v11.8h\n" "smax v17.8h, v17.8h, v11.8h\n" "smax v10.8h, v10.8h, v11.8h\n" "smax v6.8h, v6.8h, v11.8h\n" "smin v15.8h, v15.8h, v13.8h\n" "smin v17.8h, v17.8h, v13.8h\n" "smin v10.8h, v10.8h, v13.8h\n" "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" "uzp1 v17.16b, v17.16b, v17.16b\n" "str d15, [x16, x22]\n" "uzp1 v10.16b, v10.16b, v10.16b\n" "uzp1 v6.16b, v6.16b, v6.16b\n" "str d17, [x8, x22]\n" "str d10, [x4, x22]\n" "str d6, [x7, x22]\n" "add x22, x22, #0x8\n" "beq 124f\n" "add x23, x23, #0xc8\n" "3:" // Oddments "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "tbz x0, #2, 5f\n" "ld1 { v15.4s }, [x19], #0x10\n" "tbz x0, #1, 4f\n" "ld1 { v16.d }[0], [x19], #0x8\n" "tbz x0, #0, 7f\n" "ld1 { v16.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset "tbz x0, #0, 7f\n" "ld1 { v16.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset "tbz x0, #1, 6f\n" "ld1 { v15.d }[0], [x19], #0x8\n" "tbz x0, #0, 7f\n" "ld1 { v15.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 7f\n" "ld1 { v15.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End "ldr d0, [x23, #0x0]\n" "ldr d1, [x23, #0x8]\n" "mov v17.16b, v15.16b\n" "mov v8.16b, v16.16b\n" "ldr d2, [x23, #0x10]\n" "ldr d3, [x23, #0x18]\n" "mov v10.16b, v15.16b\n" "mov v7.16b, v16.16b\n" "ldr d4, [x23, #0x20]\n" "ldp x28, x6, [x20, #0x0]\n" "mov v6.16b, v15.16b\n" "mov v5.16b, v16.16b\n" "ldp x26, x25, [x20, #0x10]\n" "ldp x5, x2, [x20, #0x20]\n" "ssubl v0.8h, v0.8b, v14.8b\n" "ssubl v1.8h, v1.8b, v14.8b\n" "ldp x27, x21, [x20, #0x30]\n" "ldp x12, x19, [x20, #0x40]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ssubl v4.8h, v4.8b, v14.8b\n" "add x28, x28, x24\n" "add x6, x6, x24\n" "add x26, x26, x24\n" "add x25, x25, x24\n" "add x5, x5, x24\n" "add x2, x2, x24\n" "add x27, x27, x24\n" "add x21, x21, x24\n" "add x12, x12, x24\n" "add x19, x19, x24\n" "tbz x0, #2, 9f\n" "ld1 { v31.s }[0], [x28], #0x4\n" "ld1 { v30.s }[0], [x6], #0x4\n" "ld1 { v29.s }[0], [x26], #0x4\n" "ld1 { v28.s }[0], [x25], #0x4\n" "ld1 { v27.s }[0], [x5], #0x4\n" "ld1 { v23.s }[0], [x2], #0x4\n" "ld1 { v25.s }[0], [x27], #0x4\n" "ld1 { v24.s }[0], [x21], #0x4\n" "ld1 { v26.s }[0], [x12], #0x4\n" "ld1 { v22.s }[0], [x19], #0x4\n" "tbz x0, #1, 8f\n" "ld1 { v31.h }[2], [x28], #0x2\n" "ld1 { v30.h }[2], [x6], #0x2\n" "ld1 { v29.h }[2], [x26], #0x2\n" "ld1 { v28.h }[2], [x25], #0x2\n" "ld1 { v27.h }[2], [x5], #0x2\n" "ld1 { v23.h }[2], [x2], #0x2\n" "ld1 { v25.h }[2], [x27], #0x2\n" "ld1 { v24.h }[2], [x21], #0x2\n" "ld1 { v26.h }[2], [x12], #0x2\n" "ld1 { v22.h }[2], [x19], #0x2\n" "tbz x0, #0, 11f\n" "ld1 { v31.b }[6], [x28]\n" "ld1 { v30.b }[6], [x6]\n" "ld1 { v29.b }[6], [x26]\n" "ld1 { v28.b }[6], [x25]\n" "ld1 { v27.b }[6], [x5]\n" "ld1 { v23.b }[6], [x2]\n" "ld1 { v25.b }[6], [x27]\n" "ld1 { v24.b }[6], [x21]\n" "ld1 { v26.b }[6], [x12]\n" "ld1 { v22.b }[6], [x19]\n" "b 11f\n" "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset "tbz x0, #0, 11f\n" "ld1 { v31.b }[4], [x28]\n" "ld1 { v30.b }[4], [x6]\n" "ld1 { v29.b }[4], [x26]\n" "ld1 { v28.b }[4], [x25]\n" "ld1 { v27.b }[4], [x5]\n" "ld1 { v23.b }[4], [x2]\n" "ld1 { v25.b }[4], [x27]\n" "ld1 { v24.b }[4], [x21]\n" "ld1 { v26.b }[4], [x12]\n" "ld1 { v22.b }[4], [x19]\n" "b 11f\n" "9:" // Oddments: Initial loads: Bit 2: Unset "tbz x0, #1, 10f\n" "ld1 { v31.h }[0], [x28], #0x2\n" "ld1 { v30.h }[0], [x6], #0x2\n" "ld1 { v29.h }[0], [x26], #0x2\n" "ld1 { v28.h }[0], [x25], #0x2\n" "ld1 { v27.h }[0], [x5], #0x2\n" "ld1 { v23.h }[0], [x2], #0x2\n" "ld1 { v25.h }[0], [x27], #0x2\n" "ld1 { v24.h }[0], [x21], #0x2\n" "ld1 { v26.h }[0], [x12], #0x2\n" "ld1 { v22.h }[0], [x19], #0x2\n" "tbz x0, #0, 11f\n" "ld1 { v31.b }[2], [x28]\n" "ld1 { v30.b }[2], [x6]\n" "ld1 { v29.b }[2], [x26]\n" "ld1 { v28.b }[2], [x25]\n" "ld1 { v27.b }[2], [x5]\n" "ld1 { v23.b }[2], [x2]\n" "ld1 { v25.b }[2], [x27]\n" "ld1 { v24.b }[2], [x21]\n" "ld1 { v26.b }[2], [x12]\n" "ld1 { v22.b }[2], [x19]\n" "b 11f\n" "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 11f\n" "ld1 { v31.b }[0], [x28]\n" "ld1 { v30.b }[0], [x6]\n" "ld1 { v29.b }[0], [x26]\n" "ld1 { v28.b }[0], [x25]\n" "ld1 { v27.b }[0], [x5]\n" "ld1 { v23.b }[0], [x2]\n" "ld1 { v25.b }[0], [x27]\n" "ld1 { v24.b }[0], [x21]\n" "ld1 { v26.b }[0], [x12]\n" "ld1 { v22.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End "ssubl v31.8h, v31.8b, v9.8b\n" "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v15.4s, v31.4h, v0.4h\n" "ldr x19, [x20, #0x50]\n" "ssubl v29.8h, v29.8b, v9.8b\n" "smlal2 v16.4s, v31.8h, v0.8h\n" "smlal v17.4s, v30.4h, v0.4h\n" "smlal2 v8.4s, v30.8h, v0.8h\n" "smlal v10.4s, v29.4h, v0.4h\n" "ssubl v28.8h, v28.8b, v9.8b\n" "add x19, x19, x24\n" "smlal2 v7.4s, v29.8h, v0.8h\n" "ssubl v27.8h, v27.8b, v9.8b\n" "smlal v6.4s, v28.4h, v0.4h\n" "smlal2 v5.4s, v28.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" "ssubl v23.8h, v23.8b, v9.8b\n" "smlal2 v16.4s, v30.8h, v1.8h\n" "smlal v17.4s, v27.4h, v1.4h\n" "ssubl v25.8h, v25.8b, v9.8b\n" "smlal2 v8.4s, v27.8h, v1.8h\n" "smlal v10.4s, v28.4h, v1.4h\n" "ssubl v24.8h, v24.8b, v9.8b\n" "smlal2 v7.4s, v28.8h, v1.8h\n" "ssubl v26.8h, v26.8b, v9.8b\n" "smlal v6.4s, v23.4h, v1.4h\n" "ssubl v22.8h, v22.8b, v9.8b\n" "smlal2 v5.4s, v23.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" "smlal2 v16.4s, v27.8h, v2.8h\n" "smlal v17.4s, v25.4h, v2.4h\n" "smlal2 v8.4s, v25.8h, v2.8h\n" "smlal v10.4s, v23.4h, v2.4h\n" "smlal2 v7.4s, v23.8h, v2.8h\n" "tbz x0, #2, 13f\n" "ld1 { v31.s }[0], [x19], #0x4\n" "tbz x0, #1, 12f\n" "ld1 { v31.h }[2], [x19], #0x2\n" "tbz x0, #0, 15f\n" "ld1 { v31.b }[6], [x19]\n" "b 15f\n" "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset "tbz x0, #0, 15f\n" "ld1 { v31.b }[4], [x19]\n" "b 15f\n" "13:" // Oddments: Load (1, 3): Bit 2: Unset "tbz x0, #1, 14f\n" "ld1 { v31.h }[0], [x19], #0x2\n" "tbz x0, #0, 15f\n" "ld1 { v31.b }[2], [x19]\n" "b 15f\n" "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 15f\n" "ld1 { v31.b }[0], [x19]\n" "15:" // Oddments: Load (1, 3): Bit 2: End "ssubl v31.8h, v31.8b, v9.8b\n" "ldr x15, [x20, #0x58]\n" "smlal v6.4s, v31.4h, v2.4h\n" "smlal2 v5.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" "smlal2 v16.4s, v25.8h, v3.8h\n" "add x15, x15, x24\n" "smlal v17.4s, v24.4h, v3.4h\n" "smlal2 v8.4s, v24.8h, v3.8h\n" "smlal v10.4s, v31.4h, v3.4h\n" "smlal2 v7.4s, v31.8h, v3.8h\n" "tbz x0, #2, 17f\n" "ld1 { v30.s }[0], [x15], #0x4\n" "tbz x0, #1, 16f\n" "ld1 { v30.h }[2], [x15], #0x2\n" "tbz x0, #0, 19f\n" "ld1 { v30.b }[6], [x15]\n" "b 19f\n" "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset "tbz x0, #0, 19f\n" "ld1 { v30.b }[4], [x15]\n" "b 19f\n" "17:" // Oddments: Load (1, 4): Bit 2: Unset "tbz x0, #1, 18f\n" "ld1 { v30.h }[0], [x15], #0x2\n" "tbz x0, #0, 19f\n" "ld1 { v30.b }[2], [x15]\n" "b 19f\n" "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 19f\n" "ld1 { v30.b }[0], [x15]\n" "19:" // Oddments: Load (1, 4): Bit 2: End "ssubl v30.8h, v30.8b, v9.8b\n" "ldr x19, [x20, #0x60]\n" "smlal v6.4s, v30.4h, v3.4h\n" "smlal2 v5.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" "smlal2 v16.4s, v24.8h, v4.8h\n" "add x19, x19, x24\n" "tbz x0, #2, 21f\n" "ld1 { v27.s }[0], [x19], #0x4\n" "tbz x0, #1, 20f\n" "ld1 { v27.h }[2], [x19], #0x2\n" "tbz x0, #0, 23f\n" "ld1 { v27.b }[6], [x19]\n" "b 23f\n" "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset "tbz x0, #0, 23f\n" "ld1 { v27.b }[4], [x19]\n" "b 23f\n" "21:" // Oddments: Load (0, 5): Bit 2: Unset "tbz x0, #1, 22f\n" "ld1 { v27.h }[0], [x19], #0x2\n" "tbz x0, #0, 23f\n" "ld1 { v27.b }[2], [x19]\n" "b 23f\n" "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 23f\n" "ld1 { v27.b }[0], [x19]\n" "23:" // Oddments: Load (0, 5): Bit 2: End "ssubl v27.8h, v27.8b, v9.8b\n" "ldr d0, [x23, #0x28]\n" "smlal v17.4s, v27.4h, v4.4h\n" "smlal2 v8.4s, v27.8h, v4.8h\n" "smlal v10.4s, v30.4h, v4.4h\n" "smlal2 v7.4s, v30.8h, v4.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" "ldr x27, [x20, #0x68]\n" "smlal v6.4s, v26.4h, v4.4h\n" "smlal2 v5.4s, v26.8h, v4.8h\n" "add x27, x27, x24\n" "smlal v15.4s, v29.4h, v0.4h\n" "smlal2 v16.4s, v29.8h, v0.8h\n" "smlal v17.4s, v28.4h, v0.4h\n" "smlal2 v8.4s, v28.8h, v0.8h\n" "smlal v10.4s, v22.4h, v0.4h\n" "smlal2 v7.4s, v22.8h, v0.8h\n" "tbz x0, #2, 25f\n" "ld1 { v25.s }[0], [x27], #0x4\n" "tbz x0, #1, 24f\n" "ld1 { v25.h }[2], [x27], #0x2\n" "tbz x0, #0, 27f\n" "ld1 { v25.b }[6], [x27]\n" "b 27f\n" "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset "tbz x0, #0, 27f\n" "ld1 { v25.b }[4], [x27]\n" "b 27f\n" "25:" // Oddments: Load (2, 1): Bit 2: Unset "tbz x0, #1, 26f\n" "ld1 { v25.h }[0], [x27], #0x2\n" "tbz x0, #0, 27f\n" "ld1 { v25.b }[2], [x27]\n" "b 27f\n" "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 27f\n" "ld1 { v25.b }[0], [x27]\n" "27:" // Oddments: Load (2, 1): Bit 2: End "ldr d1, [x23, #0x30]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "ssubl v1.8h, v1.8b, v14.8b\n" "ldr x5, [x20, #0x70]\n" "smlal v6.4s, v25.4h, v0.4h\n" "smlal2 v5.4s, v25.8h, v0.8h\n" "add x5, x5, x24\n" "smlal v15.4s, v28.4h, v1.4h\n" "smlal2 v16.4s, v28.8h, v1.8h\n" "smlal v17.4s, v23.4h, v1.4h\n" "smlal2 v8.4s, v23.8h, v1.8h\n" "smlal v10.4s, v25.4h, v1.4h\n" "smlal2 v7.4s, v25.8h, v1.8h\n" "tbz x0, #2, 29f\n" "ld1 { v24.s }[0], [x5], #0x4\n" "tbz x0, #1, 28f\n" "ld1 { v24.h }[2], [x5], #0x2\n" "tbz x0, #0, 31f\n" "ld1 { v24.b }[6], [x5]\n" "b 31f\n" "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset "tbz x0, #0, 31f\n" "ld1 { v24.b }[4], [x5]\n" "b 31f\n" "29:" // Oddments: Load (2, 2): Bit 2: Unset "tbz x0, #1, 30f\n" "ld1 { v24.h }[0], [x5], #0x2\n" "tbz x0, #0, 31f\n" "ld1 { v24.b }[2], [x5]\n" "b 31f\n" "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 31f\n" "ld1 { v24.b }[0], [x5]\n" "31:" // Oddments: Load (2, 2): Bit 2: End "ldr d2, [x23, #0x38]\n" "ssubl v24.8h, v24.8b, v9.8b\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ldr x11, [x20, #0x78]\n" "smlal v6.4s, v24.4h, v1.4h\n" "smlal2 v5.4s, v24.8h, v1.8h\n" "add x11, x11, x24\n" "smlal v15.4s, v23.4h, v2.4h\n" "smlal2 v16.4s, v23.8h, v2.8h\n" "smlal v17.4s, v31.4h, v2.4h\n" "smlal2 v8.4s, v31.8h, v2.8h\n" "smlal v10.4s, v24.4h, v2.4h\n" "smlal2 v7.4s, v24.8h, v2.8h\n" "tbz x0, #2, 33f\n" "ld1 { v27.s }[0], [x11], #0x4\n" "tbz x0, #1, 32f\n" "ld1 { v27.h }[2], [x11], #0x2\n" "tbz x0, #0, 35f\n" "ld1 { v27.b }[6], [x11]\n" "b 35f\n" "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset "tbz x0, #0, 35f\n" "ld1 { v27.b }[4], [x11]\n" "b 35f\n" "33:" // Oddments: Load (2, 3): Bit 2: Unset "tbz x0, #1, 34f\n" "ld1 { v27.h }[0], [x11], #0x2\n" "tbz x0, #0, 35f\n" "ld1 { v27.b }[2], [x11]\n" "b 35f\n" "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 35f\n" "ld1 { v27.b }[0], [x11]\n" "35:" // Oddments: Load (2, 3): Bit 2: End "ldr d3, [x23, #0x40]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ldr x12, [x20, #0x80]\n" "smlal v6.4s, v27.4h, v2.4h\n" "smlal2 v5.4s, v27.8h, v2.8h\n" "add x12, x12, x24\n" "smlal v15.4s, v31.4h, v3.4h\n" "smlal2 v16.4s, v31.8h, v3.8h\n" "smlal v17.4s, v30.4h, v3.4h\n" "smlal2 v8.4s, v30.8h, v3.8h\n" "smlal v10.4s, v27.4h, v3.4h\n" "smlal2 v7.4s, v27.8h, v3.8h\n" "tbz x0, #2, 37f\n" "ld1 { v23.s }[0], [x12], #0x4\n" "tbz x0, #1, 36f\n" "ld1 { v23.h }[2], [x12], #0x2\n" "tbz x0, #0, 39f\n" "ld1 { v23.b }[6], [x12]\n" "b 39f\n" "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset "tbz x0, #0, 39f\n" "ld1 { v23.b }[4], [x12]\n" "b 39f\n" "37:" // Oddments: Load (2, 4): Bit 2: Unset "tbz x0, #1, 38f\n" "ld1 { v23.h }[0], [x12], #0x2\n" "tbz x0, #0, 39f\n" "ld1 { v23.b }[2], [x12]\n" "b 39f\n" "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 39f\n" "ld1 { v23.b }[0], [x12]\n" "39:" // Oddments: Load (2, 4): Bit 2: End "ldr d4, [x23, #0x48]\n" "ssubl v23.8h, v23.8b, v9.8b\n" "ssubl v4.8h, v4.8b, v14.8b\n" "ldr x26, [x20, #0x88]\n" "smlal v6.4s, v23.4h, v3.4h\n" "smlal2 v5.4s, v23.8h, v3.8h\n" "add x26, x26, x24\n" "smlal v15.4s, v30.4h, v4.4h\n" "smlal2 v16.4s, v30.8h, v4.8h\n" "smlal v17.4s, v26.4h, v4.4h\n" "smlal2 v8.4s, v26.8h, v4.8h\n" "smlal v10.4s, v23.4h, v4.4h\n" "smlal2 v7.4s, v23.8h, v4.8h\n" "tbz x0, #2, 41f\n" "ld1 { v28.s }[0], [x26], #0x4\n" "tbz x0, #1, 40f\n" "ld1 { v28.h }[2], [x26], #0x2\n" "tbz x0, #0, 43f\n" "ld1 { v28.b }[6], [x26]\n" "b 43f\n" "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset "tbz x0, #0, 43f\n" "ld1 { v28.b }[4], [x26]\n" "b 43f\n" "41:" // Oddments: Load (2, 5): Bit 2: Unset "tbz x0, #1, 42f\n" "ld1 { v28.h }[0], [x26], #0x2\n" "tbz x0, #0, 43f\n" "ld1 { v28.b }[2], [x26]\n" "b 43f\n" "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 43f\n" "ld1 { v28.b }[0], [x26]\n" "43:" // Oddments: Load (2, 5): Bit 2: End "ldr d0, [x23, #0x50]\n" "ssubl v28.8h, v28.8b, v9.8b\n" "ssubl v0.8h, v0.8b, v14.8b\n" "ldr x14, [x20, #0x90]\n" "smlal v6.4s, v28.4h, v4.4h\n" "smlal2 v5.4s, v28.8h, v4.8h\n" "add x14, x14, x24\n" "smlal v15.4s, v22.4h, v0.4h\n" "smlal2 v16.4s, v22.8h, v0.8h\n" "smlal v17.4s, v25.4h, v0.4h\n" "smlal2 v8.4s, v25.8h, v0.8h\n" "tbz x0, #2, 45f\n" "ld1 { v31.s }[0], [x14], #0x4\n" "tbz x0, #1, 44f\n" "ld1 { v31.h }[2], [x14], #0x2\n" "tbz x0, #0, 47f\n" "ld1 { v31.b }[6], [x14]\n" "b 47f\n" "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset "tbz x0, #0, 47f\n" "ld1 { v31.b }[4], [x14]\n" "b 47f\n" "45:" // Oddments: Load (3, 0): Bit 2: Unset "tbz x0, #1, 46f\n" "ld1 { v31.h }[0], [x14], #0x2\n" "tbz x0, #0, 47f\n" "ld1 { v31.b }[2], [x14]\n" "b 47f\n" "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 47f\n" "ld1 { v31.b }[0], [x14]\n" "47:" // Oddments: Load (3, 0): Bit 2: End "ssubl v31.8h, v31.8b, v9.8b\n" "ldr x15, [x20, #0x98]\n" "smlal v10.4s, v31.4h, v0.4h\n" "smlal2 v7.4s, v31.8h, v0.8h\n" "add x15, x15, x24\n" "tbz x0, #2, 49f\n" "ld1 { v30.s }[0], [x15], #0x4\n" "tbz x0, #1, 48f\n" "ld1 { v30.h }[2], [x15], #0x2\n" "tbz x0, #0, 51f\n" "ld1 { v30.b }[6], [x15]\n" "b 51f\n" "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset "tbz x0, #0, 51f\n" "ld1 { v30.b }[4], [x15]\n" "b 51f\n" "49:" // Oddments: Load (3, 1): Bit 2: Unset "tbz x0, #1, 50f\n" "ld1 { v30.h }[0], [x15], #0x2\n" "tbz x0, #0, 51f\n" "ld1 { v30.b }[2], [x15]\n" "b 51f\n" "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 51f\n" "ld1 { v30.b }[0], [x15]\n" "51:" // Oddments: Load (3, 1): Bit 2: End "ldr d1, [x23, #0x58]\n" "ssubl v30.8h, v30.8b, v9.8b\n" "ssubl v1.8h, v1.8b, v14.8b\n" "ldr x21, [x20, #0xa0]\n" "smlal v6.4s, v30.4h, v0.4h\n" "smlal2 v5.4s, v30.8h, v0.8h\n" "add x21, x21, x24\n" "smlal v15.4s, v25.4h, v1.4h\n" "smlal2 v16.4s, v25.8h, v1.8h\n" "smlal v17.4s, v24.4h, v1.4h\n" "smlal2 v8.4s, v24.8h, v1.8h\n" "smlal v10.4s, v30.4h, v1.4h\n" "smlal2 v7.4s, v30.8h, v1.8h\n" "tbz x0, #2, 53f\n" "ld1 { v26.s }[0], [x21], #0x4\n" "tbz x0, #1, 52f\n" "ld1 { v26.h }[2], [x21], #0x2\n" "tbz x0, #0, 55f\n" "ld1 { v26.b }[6], [x21]\n" "b 55f\n" "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset "tbz x0, #0, 55f\n" "ld1 { v26.b }[4], [x21]\n" "b 55f\n" "53:" // Oddments: Load (3, 2): Bit 2: Unset "tbz x0, #1, 54f\n" "ld1 { v26.h }[0], [x21], #0x2\n" "tbz x0, #0, 55f\n" "ld1 { v26.b }[2], [x21]\n" "b 55f\n" "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 55f\n" "ld1 { v26.b }[0], [x21]\n" "55:" // Oddments: Load (3, 2): Bit 2: End "ldr d2, [x23, #0x60]\n" "ssubl v26.8h, v26.8b, v9.8b\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ldr x2, [x20, #0xa8]\n" "smlal v6.4s, v26.4h, v1.4h\n" "smlal2 v5.4s, v26.8h, v1.8h\n" "add x2, x2, x24\n" "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v16.4s, v24.8h, v2.8h\n" "smlal v17.4s, v27.4h, v2.4h\n" "smlal2 v8.4s, v27.8h, v2.8h\n" "smlal v10.4s, v26.4h, v2.4h\n" "smlal2 v7.4s, v26.8h, v2.8h\n" "tbz x0, #2, 57f\n" "ld1 { v25.s }[0], [x2], #0x4\n" "tbz x0, #1, 56f\n" "ld1 { v25.h }[2], [x2], #0x2\n" "tbz x0, #0, 59f\n" "ld1 { v25.b }[6], [x2]\n" "b 59f\n" "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset "tbz x0, #0, 59f\n" "ld1 { v25.b }[4], [x2]\n" "b 59f\n" "57:" // Oddments: Load (3, 3): Bit 2: Unset "tbz x0, #1, 58f\n" "ld1 { v25.h }[0], [x2], #0x2\n" "tbz x0, #0, 59f\n" "ld1 { v25.b }[2], [x2]\n" "b 59f\n" "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 59f\n" "ld1 { v25.b }[0], [x2]\n" "59:" // Oddments: Load (3, 3): Bit 2: End "ldr d3, [x23, #0x68]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ldr x13, [x20, #0xb0]\n" "smlal v6.4s, v25.4h, v2.4h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" "add x13, x13, x24\n" "smlal v15.4s, v27.4h, v3.4h\n" "smlal2 v16.4s, v27.8h, v3.8h\n" "smlal v17.4s, v23.4h, v3.4h\n" "smlal2 v8.4s, v23.8h, v3.8h\n" "smlal v10.4s, v25.4h, v3.4h\n" "smlal2 v7.4s, v25.8h, v3.8h\n" "tbz x0, #2, 61f\n" "ld1 { v24.s }[0], [x13], #0x4\n" "tbz x0, #1, 60f\n" "ld1 { v24.h }[2], [x13], #0x2\n" "tbz x0, #0, 63f\n" "ld1 { v24.b }[6], [x13]\n" "b 63f\n" "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset "tbz x0, #0, 63f\n" "ld1 { v24.b }[4], [x13]\n" "b 63f\n" "61:" // Oddments: Load (3, 4): Bit 2: Unset "tbz x0, #1, 62f\n" "ld1 { v24.h }[0], [x13], #0x2\n" "tbz x0, #0, 63f\n" "ld1 { v24.b }[2], [x13]\n" "b 63f\n" "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 63f\n" "ld1 { v24.b }[0], [x13]\n" "63:" // Oddments: Load (3, 4): Bit 2: End "ldr d4, [x23, #0x70]\n" "ssubl v24.8h, v24.8b, v9.8b\n" "ssubl v4.8h, v4.8b, v14.8b\n" "ldr x9, [x20, #0xb8]\n" "smlal v6.4s, v24.4h, v3.4h\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "add x9, x9, x24\n" "smlal v15.4s, v23.4h, v4.4h\n" "smlal2 v16.4s, v23.8h, v4.8h\n" "smlal v17.4s, v28.4h, v4.4h\n" "smlal2 v8.4s, v28.8h, v4.8h\n" "smlal v10.4s, v24.4h, v4.4h\n" "smlal2 v7.4s, v24.8h, v4.8h\n" "tbz x0, #2, 65f\n" "ld1 { v22.s }[0], [x9], #0x4\n" "tbz x0, #1, 64f\n" "ld1 { v22.h }[2], [x9], #0x2\n" "tbz x0, #0, 67f\n" "ld1 { v22.b }[6], [x9]\n" "b 67f\n" "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset "tbz x0, #0, 67f\n" "ld1 { v22.b }[4], [x9]\n" "b 67f\n" "65:" // Oddments: Load (3, 5): Bit 2: Unset "tbz x0, #1, 66f\n" "ld1 { v22.h }[0], [x9], #0x2\n" "tbz x0, #0, 67f\n" "ld1 { v22.b }[2], [x9]\n" "b 67f\n" "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 67f\n" "ld1 { v22.b }[0], [x9]\n" "67:" // Oddments: Load (3, 5): Bit 2: End "ldr d0, [x23, #0x78]\n" "ssubl v22.8h, v22.8b, v9.8b\n" "ssubl v0.8h, v0.8b, v14.8b\n" "ldr x19, [x20, #0xc0]\n" "smlal v6.4s, v22.4h, v4.4h\n" "smlal2 v5.4s, v22.8h, v4.8h\n" "add x19, x19, x24\n" "smlal v15.4s, v31.4h, v0.4h\n" "smlal2 v16.4s, v31.8h, v0.8h\n" "smlal v17.4s, v30.4h, v0.4h\n" "smlal2 v8.4s, v30.8h, v0.8h\n" "tbz x0, #2, 69f\n" "ld1 { v27.s }[0], [x19], #0x4\n" "tbz x0, #1, 68f\n" "ld1 { v27.h }[2], [x19], #0x2\n" "tbz x0, #0, 71f\n" "ld1 { v27.b }[6], [x19]\n" "b 71f\n" "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset "tbz x0, #0, 71f\n" "ld1 { v27.b }[4], [x19]\n" "b 71f\n" "69:" // Oddments: Load (4, 0): Bit 2: Unset "tbz x0, #1, 70f\n" "ld1 { v27.h }[0], [x19], #0x2\n" "tbz x0, #0, 71f\n" "ld1 { v27.b }[2], [x19]\n" "b 71f\n" "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 71f\n" "ld1 { v27.b }[0], [x19]\n" "71:" // Oddments: Load (4, 0): Bit 2: End "ssubl v27.8h, v27.8b, v9.8b\n" "ldr x28, [x20, #0xc8]\n" "smlal v10.4s, v27.4h, v0.4h\n" "smlal2 v7.4s, v27.8h, v0.8h\n" "add x28, x28, x24\n" "tbz x0, #2, 73f\n" "ld1 { v23.s }[0], [x28], #0x4\n" "tbz x0, #1, 72f\n" "ld1 { v23.h }[2], [x28], #0x2\n" "tbz x0, #0, 75f\n" "ld1 { v23.b }[6], [x28]\n" "b 75f\n" "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset "tbz x0, #0, 75f\n" "ld1 { v23.b }[4], [x28]\n" "b 75f\n" "73:" // Oddments: Load (4, 1): Bit 2: Unset "tbz x0, #1, 74f\n" "ld1 { v23.h }[0], [x28], #0x2\n" "tbz x0, #0, 75f\n" "ld1 { v23.b }[2], [x28]\n" "b 75f\n" "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 75f\n" "ld1 { v23.b }[0], [x28]\n" "75:" // Oddments: Load (4, 1): Bit 2: End "ldr d1, [x23, #0x80]\n" "ssubl v23.8h, v23.8b, v9.8b\n" "ssubl v1.8h, v1.8b, v14.8b\n" "ldr x6, [x20, #0xd0]\n" "smlal v6.4s, v23.4h, v0.4h\n" "smlal2 v5.4s, v23.8h, v0.8h\n" "add x6, x6, x24\n" "smlal v15.4s, v30.4h, v1.4h\n" "smlal2 v16.4s, v30.8h, v1.8h\n" "smlal v17.4s, v26.4h, v1.4h\n" "smlal2 v8.4s, v26.8h, v1.8h\n" "smlal v10.4s, v23.4h, v1.4h\n" "smlal2 v7.4s, v23.8h, v1.8h\n" "tbz x0, #2, 77f\n" "ld1 { v31.s }[0], [x6], #0x4\n" "tbz x0, #1, 76f\n" "ld1 { v31.h }[2], [x6], #0x2\n" "tbz x0, #0, 79f\n" "ld1 { v31.b }[6], [x6]\n" "b 79f\n" "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset "tbz x0, #0, 79f\n" "ld1 { v31.b }[4], [x6]\n" "b 79f\n" "77:" // Oddments: Load (4, 2): Bit 2: Unset "tbz x0, #1, 78f\n" "ld1 { v31.h }[0], [x6], #0x2\n" "tbz x0, #0, 79f\n" "ld1 { v31.b }[2], [x6]\n" "b 79f\n" "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 79f\n" "ld1 { v31.b }[0], [x6]\n" "79:" // Oddments: Load (4, 2): Bit 2: End "ldr d2, [x23, #0x88]\n" "ssubl v31.8h, v31.8b, v9.8b\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ldr x27, [x20, #0xd8]\n" "smlal v6.4s, v31.4h, v1.4h\n" "smlal2 v5.4s, v31.8h, v1.8h\n" "add x27, x27, x24\n" "smlal v15.4s, v26.4h, v2.4h\n" "smlal2 v16.4s, v26.8h, v2.8h\n" "smlal v17.4s, v25.4h, v2.4h\n" "smlal2 v8.4s, v25.8h, v2.8h\n" "smlal v10.4s, v31.4h, v2.4h\n" "smlal2 v7.4s, v31.8h, v2.8h\n" "tbz x0, #2, 81f\n" "ld1 { v30.s }[0], [x27], #0x4\n" "tbz x0, #1, 80f\n" "ld1 { v30.h }[2], [x27], #0x2\n" "tbz x0, #0, 83f\n" "ld1 { v30.b }[6], [x27]\n" "b 83f\n" "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset "tbz x0, #0, 83f\n" "ld1 { v30.b }[4], [x27]\n" "b 83f\n" "81:" // Oddments: Load (4, 3): Bit 2: Unset "tbz x0, #1, 82f\n" "ld1 { v30.h }[0], [x27], #0x2\n" "tbz x0, #0, 83f\n" "ld1 { v30.b }[2], [x27]\n" "b 83f\n" "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 83f\n" "ld1 { v30.b }[0], [x27]\n" "83:" // Oddments: Load (4, 3): Bit 2: End "ldr d3, [x23, #0x90]\n" "ssubl v30.8h, v30.8b, v9.8b\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ldr x11, [x20, #0xe0]\n" "smlal v6.4s, v30.4h, v2.4h\n" "smlal2 v5.4s, v30.8h, v2.8h\n" "add x11, x11, x24\n" "smlal v15.4s, v25.4h, v3.4h\n" "smlal2 v16.4s, v25.8h, v3.8h\n" "smlal v17.4s, v24.4h, v3.4h\n" "smlal2 v8.4s, v24.8h, v3.8h\n" "smlal v10.4s, v30.4h, v3.4h\n" "smlal2 v7.4s, v30.8h, v3.8h\n" "tbz x0, #2, 85f\n" "ld1 { v28.s }[0], [x11], #0x4\n" "tbz x0, #1, 84f\n" "ld1 { v28.h }[2], [x11], #0x2\n" "tbz x0, #0, 87f\n" "ld1 { v28.b }[6], [x11]\n" "b 87f\n" "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset "tbz x0, #0, 87f\n" "ld1 { v28.b }[4], [x11]\n" "b 87f\n" "85:" // Oddments: Load (4, 4): Bit 2: Unset "tbz x0, #1, 86f\n" "ld1 { v28.h }[0], [x11], #0x2\n" "tbz x0, #0, 87f\n" "ld1 { v28.b }[2], [x11]\n" "b 87f\n" "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 87f\n" "ld1 { v28.b }[0], [x11]\n" "87:" // Oddments: Load (4, 4): Bit 2: End "ldr d4, [x23, #0x98]\n" "ssubl v28.8h, v28.8b, v9.8b\n" "ssubl v4.8h, v4.8b, v14.8b\n" "ldr x17, [x20, #0xe8]\n" "smlal v6.4s, v28.4h, v3.4h\n" "smlal2 v5.4s, v28.8h, v3.8h\n" "add x17, x17, x24\n" "smlal v15.4s, v24.4h, v4.4h\n" "smlal2 v16.4s, v24.8h, v4.8h\n" "smlal v17.4s, v22.4h, v4.4h\n" "smlal2 v8.4s, v22.8h, v4.8h\n" "smlal v10.4s, v28.4h, v4.4h\n" "smlal2 v7.4s, v28.8h, v4.8h\n" "tbz x0, #2, 89f\n" "ld1 { v26.s }[0], [x17], #0x4\n" "tbz x0, #1, 88f\n" "ld1 { v26.h }[2], [x17], #0x2\n" "tbz x0, #0, 91f\n" "ld1 { v26.b }[6], [x17]\n" "b 91f\n" "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset "tbz x0, #0, 91f\n" "ld1 { v26.b }[4], [x17]\n" "b 91f\n" "89:" // Oddments: Load (4, 5): Bit 2: Unset "tbz x0, #1, 90f\n" "ld1 { v26.h }[0], [x17], #0x2\n" "tbz x0, #0, 91f\n" "ld1 { v26.b }[2], [x17]\n" "b 91f\n" "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 91f\n" "ld1 { v26.b }[0], [x17]\n" "91:" // Oddments: Load (4, 5): Bit 2: End "ldr d0, [x23, #0xa0]\n" "ssubl v26.8h, v26.8b, v9.8b\n" "ssubl v0.8h, v0.8b, v14.8b\n" "ldr x5, [x20, #0xf0]\n" "smlal v6.4s, v26.4h, v4.4h\n" "smlal2 v5.4s, v26.8h, v4.8h\n" "add x5, x5, x24\n" "smlal v15.4s, v27.4h, v0.4h\n" "smlal2 v16.4s, v27.8h, v0.8h\n" "smlal v17.4s, v23.4h, v0.4h\n" "smlal2 v8.4s, v23.8h, v0.8h\n" "tbz x0, #2, 93f\n" "ld1 { v25.s }[0], [x5], #0x4\n" "tbz x0, #1, 92f\n" "ld1 { v25.h }[2], [x5], #0x2\n" "tbz x0, #0, 95f\n" "ld1 { v25.b }[6], [x5]\n" "b 95f\n" "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset "tbz x0, #0, 95f\n" "ld1 { v25.b }[4], [x5]\n" "b 95f\n" "93:" // Oddments: Load (5, 0): Bit 2: Unset "tbz x0, #1, 94f\n" "ld1 { v25.h }[0], [x5], #0x2\n" "tbz x0, #0, 95f\n" "ld1 { v25.b }[2], [x5]\n" "b 95f\n" "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 95f\n" "ld1 { v25.b }[0], [x5]\n" "95:" // Oddments: Load (5, 0): Bit 2: End "ssubl v25.8h, v25.8b, v9.8b\n" "ldr x25, [x20, #0xf8]\n" "smlal v10.4s, v25.4h, v0.4h\n" "smlal2 v7.4s, v25.8h, v0.8h\n" "add x25, x25, x24\n" "tbz x0, #2, 97f\n" "ld1 { v24.s }[0], [x25], #0x4\n" "tbz x0, #1, 96f\n" "ld1 { v24.h }[2], [x25], #0x2\n" "tbz x0, #0, 99f\n" "ld1 { v24.b }[6], [x25]\n" "b 99f\n" "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset "tbz x0, #0, 99f\n" "ld1 { v24.b }[4], [x25]\n" "b 99f\n" "97:" // Oddments: Load (5, 1): Bit 2: Unset "tbz x0, #1, 98f\n" "ld1 { v24.h }[0], [x25], #0x2\n" "tbz x0, #0, 99f\n" "ld1 { v24.b }[2], [x25]\n" "b 99f\n" "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 99f\n" "ld1 { v24.b }[0], [x25]\n" "99:" // Oddments: Load (5, 1): Bit 2: End "ldr d1, [x23, #0xa8]\n" "ssubl v24.8h, v24.8b, v9.8b\n" "ssubl v1.8h, v1.8b, v14.8b\n" "ldr x26, [x20, #0x100]\n" "smlal v6.4s, v24.4h, v0.4h\n" "smlal2 v5.4s, v24.8h, v0.8h\n" "add x26, x26, x24\n" "smlal v15.4s, v23.4h, v1.4h\n" "smlal2 v16.4s, v23.8h, v1.8h\n" "smlal v17.4s, v31.4h, v1.4h\n" "smlal2 v8.4s, v31.8h, v1.8h\n" "smlal v10.4s, v24.4h, v1.4h\n" "smlal2 v7.4s, v24.8h, v1.8h\n" "tbz x0, #2, 101f\n" "ld1 { v27.s }[0], [x26], #0x4\n" "tbz x0, #1, 100f\n" "ld1 { v27.h }[2], [x26], #0x2\n" "tbz x0, #0, 103f\n" "ld1 { v27.b }[6], [x26]\n" "b 103f\n" "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset "tbz x0, #0, 103f\n" "ld1 { v27.b }[4], [x26]\n" "b 103f\n" "101:" // Oddments: Load (5, 2): Bit 2: Unset "tbz x0, #1, 102f\n" "ld1 { v27.h }[0], [x26], #0x2\n" "tbz x0, #0, 103f\n" "ld1 { v27.b }[2], [x26]\n" "b 103f\n" "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 103f\n" "ld1 { v27.b }[0], [x26]\n" "103:" // Oddments: Load (5, 2): Bit 2: End "ldr d2, [x23, #0xb0]\n" "ssubl v27.8h, v27.8b, v9.8b\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ldr x12, [x20, #0x108]\n" "smlal v6.4s, v27.4h, v1.4h\n" "smlal2 v5.4s, v27.8h, v1.8h\n" "add x12, x12, x24\n" "smlal v15.4s, v31.4h, v2.4h\n" "smlal2 v16.4s, v31.8h, v2.8h\n" "smlal v17.4s, v30.4h, v2.4h\n" "smlal2 v8.4s, v30.8h, v2.8h\n" "smlal v10.4s, v27.4h, v2.4h\n" "smlal2 v7.4s, v27.8h, v2.8h\n" "tbz x0, #2, 105f\n" "ld1 { v25.s }[0], [x12], #0x4\n" "tbz x0, #1, 104f\n" "ld1 { v25.h }[2], [x12], #0x2\n" "tbz x0, #0, 107f\n" "ld1 { v25.b }[6], [x12]\n" "b 107f\n" "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset "tbz x0, #0, 107f\n" "ld1 { v25.b }[4], [x12]\n" "b 107f\n" "105:" // Oddments: Load (5, 3): Bit 2: Unset "tbz x0, #1, 106f\n" "ld1 { v25.h }[0], [x12], #0x2\n" "tbz x0, #0, 107f\n" "ld1 { v25.b }[2], [x12]\n" "b 107f\n" "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 107f\n" "ld1 { v25.b }[0], [x12]\n" "107:" // Oddments: Load (5, 3): Bit 2: End "ldr d3, [x23, #0xb8]\n" "ssubl v25.8h, v25.8b, v9.8b\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ldr x14, [x20, #0x110]\n" "smlal v6.4s, v25.4h, v2.4h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" "add x14, x14, x24\n" "smlal v15.4s, v30.4h, v3.4h\n" "smlal2 v16.4s, v30.8h, v3.8h\n" "smlal v17.4s, v28.4h, v3.4h\n" "smlal2 v8.4s, v28.8h, v3.8h\n" "smlal v10.4s, v25.4h, v3.4h\n" "smlal2 v7.4s, v25.8h, v3.8h\n" "tbz x0, #2, 109f\n" "ld1 { v24.s }[0], [x14], #0x4\n" "tbz x0, #1, 108f\n" "ld1 { v24.h }[2], [x14], #0x2\n" "tbz x0, #0, 111f\n" "ld1 { v24.b }[6], [x14]\n" "b 111f\n" "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset "tbz x0, #0, 111f\n" "ld1 { v24.b }[4], [x14]\n" "b 111f\n" "109:" // Oddments: Load (5, 4): Bit 2: Unset "tbz x0, #1, 110f\n" "ld1 { v24.h }[0], [x14], #0x2\n" "tbz x0, #0, 111f\n" "ld1 { v24.b }[2], [x14]\n" "b 111f\n" "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 111f\n" "ld1 { v24.b }[0], [x14]\n" "111:" // Oddments: Load (5, 4): Bit 2: End "ldr d4, [x23, #0xc0]\n" "ssubl v24.8h, v24.8b, v9.8b\n" "ssubl v4.8h, v4.8b, v14.8b\n" "ldr x21, [x20, #0x118]\n" "smlal v6.4s, v24.4h, v3.4h\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "add x21, x21, x24\n" "smlal v15.4s, v28.4h, v4.4h\n" "smlal2 v16.4s, v28.8h, v4.8h\n" "smlal v17.4s, v26.4h, v4.4h\n" "smlal2 v8.4s, v26.8h, v4.8h\n" "smlal v10.4s, v24.4h, v4.4h\n" "smlal2 v7.4s, v24.8h, v4.8h\n" "tbz x0, #2, 113f\n" "ld1 { v27.s }[0], [x21], #0x4\n" "tbz x0, #1, 112f\n" "ld1 { v27.h }[2], [x21], #0x2\n" "tbz x0, #0, 115f\n" "ld1 { v27.b }[6], [x21]\n" "b 115f\n" "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset "tbz x0, #0, 115f\n" "ld1 { v27.b }[4], [x21]\n" "b 115f\n" "113:" // Oddments: Load (5, 5): Bit 2: Unset "tbz x0, #1, 114f\n" "ld1 { v27.h }[0], [x21], #0x2\n" "tbz x0, #0, 115f\n" "ld1 { v27.b }[2], [x21]\n" "b 115f\n" "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 115f\n" "ld1 { v27.b }[0], [x21]\n" "115:" // Oddments: Load (5, 5): Bit 2: End "ssubl v27.8h, v27.8b, v9.8b\n" "smlal v6.4s, v27.4h, v4.4h\n" "smlal2 v5.4s, v27.8h, v4.8h\n" "tbz x0, #2, 117f\n" "ld1 { v12.4s }, [x10], #0x10\n" "ld1 { v19.4s }, [x1], #0x10\n" "tbz x0, #1, 116f\n" "ld1 { v20.d }[0], [x10], #0x8\n" "ld1 { v29.d }[0], [x1], #0x8\n" "tbz x0, #0, 119f\n" "ld1 { v20.s }[2], [x10]\n" "ld1 { v29.s }[2], [x1]\n" "b 119f\n" "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset "tbz x0, #0, 119f\n" "ld1 { v20.s }[0], [x10]\n" "ld1 { v29.s }[0], [x1]\n" "b 119f\n" "117:" // Oddments: Load requant params: Bit 2: Unset "tbz x0, #1, 118f\n" "ld1 { v12.d }[0], [x10], #0x8\n" "ld1 { v19.d }[0], [x1], #0x8\n" "tbz x0, #0, 119f\n" "ld1 { v12.s }[2], [x10]\n" "ld1 { v19.s }[2], [x1]\n" "b 119f\n" "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 119f\n" "ld1 { v12.s }[0], [x10]\n" "ld1 { v19.s }[0], [x1]\n" "119:" // Oddments: Load requant params: Bit 2: End "sqrdmulh v15.4s, v15.4s, v12.4s\n" "sqrdmulh v17.4s, v17.4s, v12.4s\n" "add x16, x16, x22\n" "add x8, x8, x22\n" "sqrdmulh v10.4s, v10.4s, v12.4s\n" "sqrdmulh v6.4s, v6.4s, v12.4s\n" "add x4, x4, x22\n" "add x7, x7, x22\n" "and v23.16b, v15.16b, v19.16b\n" "sqrdmulh v16.4s, v16.4s, v20.4s\n" "and v22.16b, v17.16b, v19.16b\n" "sqrdmulh v8.4s, v8.4s, v20.4s\n" "and v21.16b, v10.16b, v19.16b\n" "sqrdmulh v7.4s, v7.4s, v20.4s\n" "and v26.16b, v6.16b, v19.16b\n" "sqrdmulh v5.4s, v5.4s, v20.4s\n" "sshr v23.4s, v23.4s, #0x1f\n" "and v4.16b, v16.16b, v29.16b\n" "sshr v22.4s, v22.4s, #0x1f\n" "and v2.16b, v8.16b, v29.16b\n" "sshr v21.4s, v21.4s, #0x1f\n" "and v3.16b, v7.16b, v29.16b\n" "sshr v26.4s, v26.4s, #0x1f\n" "and v25.16b, v5.16b, v29.16b\n" "sqadd v15.4s, v15.4s, v23.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" "sqadd v10.4s, v10.4s, v21.4s\n" "sshr v3.4s, v3.4s, #0x1f\n" "sqadd v6.4s, v6.4s, v26.4s\n" "sshr v25.4s, v25.4s, #0x1f\n" "srshl v15.4s, v15.4s, v19.4s\n" "sqadd v16.4s, v16.4s, v4.4s\n" "srshl v17.4s, v17.4s, v19.4s\n" "sqadd v8.4s, v8.4s, v2.4s\n" "srshl v10.4s, v10.4s, v19.4s\n" "sqadd v7.4s, v7.4s, v3.4s\n" "srshl v6.4s, v6.4s, v19.4s\n" "sqadd v5.4s, v5.4s, v25.4s\n" "srshl v16.4s, v16.4s, v29.4s\n" "sqxtn v15.4h, v15.4s\n" "srshl v8.4s, v8.4s, v29.4s\n" "sqxtn v17.4h, v17.4s\n" "srshl v7.4s, v7.4s, v29.4s\n" "sqxtn v10.4h, v10.4s\n" "srshl v5.4s, v5.4s, v29.4s\n" "sqxtn v6.4h, v6.4s\n" "sqxtn2 v15.8h, v16.4s\n" "sqxtn2 v17.8h, v8.4s\n" "sqxtn2 v10.8h, v7.4s\n" "sqxtn2 v6.8h, v5.4s\n" "sqadd v15.8h, v15.8h, v18.8h\n" "sqadd v17.8h, v17.8h, v18.8h\n" "sqadd v10.8h, v10.8h, v18.8h\n" "sqadd v6.8h, v6.8h, v18.8h\n" "smax v15.8h, v15.8h, v11.8h\n" "smax v17.8h, v17.8h, v11.8h\n" "smax v10.8h, v10.8h, v11.8h\n" "smax v6.8h, v6.8h, v11.8h\n" "smin v15.8h, v15.8h, v13.8h\n" "smin v17.8h, v17.8h, v13.8h\n" "smin v10.8h, v10.8h, v13.8h\n" "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" "uzp1 v17.16b, v17.16b, v17.16b\n" "uzp1 v10.16b, v10.16b, v10.16b\n" "uzp1 v6.16b, v6.16b, v6.16b\n" "tbz x0, #2, 121f\n" "st1 { v15.s }[0], [x16], #0x4\n" "st1 { v17.s }[0], [x8], #0x4\n" "st1 { v10.s }[0], [x4], #0x4\n" "st1 { v6.s }[0], [x7], #0x4\n" "tbz x0, #1, 120f\n" "st1 { v15.h }[2], [x16], #0x2\n" "st1 { v17.h }[2], [x8], #0x2\n" "st1 { v10.h }[2], [x4], #0x2\n" "st1 { v6.h }[2], [x7], #0x2\n" "tbz x0, #0, 123f\n" "st1 { v15.b }[6], [x16], #0x1\n" "st1 { v17.b }[6], [x8], #0x1\n" "st1 { v10.b }[6], [x4], #0x1\n" "st1 { v6.b }[6], [x7], #0x1\n" "b 123f\n" "120:" // Oddments: Bit 2: Bit 1: Unset "tbz x0, #0, 123f\n" "st1 { v15.b }[4], [x16], #0x1\n" "st1 { v17.b }[4], [x8], #0x1\n" "st1 { v10.b }[4], [x4], #0x1\n" "st1 { v6.b }[4], [x7], #0x1\n" "b 123f\n" "121:" // Oddments: Bit 2: Unset "tbz x0, #1, 122f\n" "st1 { v15.h }[0], [x16], #0x2\n" "st1 { v17.h }[0], [x8], #0x2\n" "st1 { v10.h }[0], [x4], #0x2\n" "st1 { v6.h }[0], [x7], #0x2\n" "tbz x0, #0, 123f\n" "st1 { v15.b }[2], [x16], #0x1\n" "st1 { v17.b }[2], [x8], #0x1\n" "st1 { v10.b }[2], [x4], #0x1\n" "st1 { v6.b }[2], [x7], #0x1\n" "b 123f\n" "122:" // Oddments: Bit 2: Unset: Bit 1: Unset "tbz x0, #0, 123f\n" "st1 { v15.b }[0], [x16], #0x1\n" "st1 { v17.b }[0], [x8], #0x1\n" "st1 { v10.b }[0], [x4], #0x1\n" "st1 { v6.b }[0], [x7], #0x1\n" "123:" // Oddments: Bit 2: End "124:" // End : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } } // namespace depthwise } // namespace arm_conv #endif // defined(__aarch64__)