/* * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if defined(__aarch64__) #include "arm_gemm.hpp" #include namespace arm_conv { namespace depthwise { void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, // Bias, should be wrapped into the parameters const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, // Requant parameters, also wrapped int8_t *const *const outptrs ) { __asm__ __volatile__( "mov x20, #0x1\n" "orr x20, x20, #0x100\n" "ldp x15, x14, [%x[inptrs], #0x0]\n" "ldp x13, x12, [%x[inptrs], #0x10]\n" "orr x20, x20, #0x10000\n" "lsr x11, %x[n_channels], #0x4\n" "dup v14.4s, w20\n" "ldp x10, x9, [%x[inptrs], #0x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_minval]\n" "ld1r { v13.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n" "ld1r { v12.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n" "ld1r { v11.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n" "ld1r { v10.4s }, [x20]\n" "mov x28, #0x0\n" "mov x27, #0x0\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "ldp x24, x23, [%x[outptrs], #0x0]\n" "ldp x22, x21, [%x[outptrs], #0x10]\n" "cbz x11, 3f\n" "ldr q9, [x15, x28]\n" "ldr q8, [x14, x28]\n" "subs x11, x11, #0x1\n" "ldr q7, [x13, x28]\n" "ldr q6, [x12, x28]\n" "zip2 v5.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "ldr q4, [x10, x28]\n" "ldr q3, [x9, x28]\n" "zip1 v7.16b, v8.16b, v6.16b\n" "zip2 v6.16b, v8.16b, v6.16b\n" "ldr q2, [x26, x28]\n" "ldr q1, [x25, x28]\n" "zip2 v8.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "ldr q0, [%x[params], #0x10]\n" "ldr q16, [%x[params], #0x20]\n" "zip1 v7.16b, v5.16b, v6.16b\n" "zip2 v6.16b, v5.16b, v6.16b\n" "ldr q5, [%x[params], #0x0]\n" "ldr q31, [%x[params], #0x30]\n" "zip2 v30.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "ldp x15, x14, [%x[inptrs], #0x40]\n" "ldr q29, [x15, x28]\n" "zip1 v2.16b, v3.16b, v1.16b\n" "zip2 v1.16b, v3.16b, v1.16b\n" "ldr q28, [x14, x28]\n" "ldp x13, x12, [%x[inptrs], #0x50]\n" "zip2 v3.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "ldr q27, [x13, x28]\n" "ldr q26, [x12, x28]\n" "zip2 v25.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "ldp x10, x9, [%x[inptrs], #0x60]\n" "ldr q24, [x10, x28]\n" "zip1 v27.16b, v28.16b, v26.16b\n" "zip2 v26.16b, v28.16b, v26.16b\n" "ldr q23, [x9, x28]\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "zip1 v2.16b, v30.16b, v1.16b\n" "zip2 v1.16b, v30.16b, v1.16b\n" "ldr q22, [x26, x28]\n" "ldr q21, [x25, x28]\n" "zip2 v20.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v23.16b, v21.16b\n" "zip2 v21.16b, v23.16b, v21.16b\n" "ldp x15, x14, [%x[inptrs], #0x0]\n" "ldp x13, x12, [%x[inptrs], #0x10]\n" "ldp x10, x9, [%x[inptrs], #0x20]\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "zip2 v28.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v25.16b, v26.16b\n" "zip2 v26.16b, v25.16b, v26.16b\n" "add %x[params], %x[params], #0x40\n" "zip2 v23.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v20.16b, v21.16b\n" "zip2 v21.16b, v20.16b, v21.16b\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" "beq 2f\n" "1:" // Loop "movi v19.4s, #0x0\n" ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n" ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n" "add x28, x28, #0x10\n" ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n" ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n" "subs x11, x11, #0x1\n" ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n" ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n" "ext v9.16b, v9.16b, v9.16b, #0x1\n" ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n" ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n" ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n" "movi v17.4s, #0x0\n" ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n" ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n" ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n" "ldr q4, [%x[params], #0x10]\n" ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n" "mls v5.4s, v19.4s, v11.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n" ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n" "ldr q9, [%x[params], #0x0]\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n" ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x60]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x40]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x50]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x30]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x70]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n" ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x20]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n" ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n" ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n" "ext v8.16b, v8.16b, v8.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n" ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n" ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n" ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n" ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n" "ldr q3, [x9, x28]\n" ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n" ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n" ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n" ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n" "ldr q8, [x14, x28]\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0xc0]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0xa0]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0xb0]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x90]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0xd0]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n" ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x80]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n" ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n" ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n" ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n" ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n" ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n" ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n" "ldr q2, [x26, x28]\n" ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n" ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n" ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n" ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n" "ldr q7, [x13, x28]\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x120]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x100]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x110]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0xf0]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x130]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n" ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0xe0]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n" ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n" ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n" ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n" ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n" ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n" ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n" "ext v26.16b, v26.16b, v26.16b, #0x1\n" ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n" "ldr q1, [x25, x28]\n" ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n" ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n" ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n" ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n" "ldr q6, [x12, x28]\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [x15, x28]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "ldp x15, x14, [%x[inptrs], #0x40]\n" "ldr q29, [x15, x28]\n" "ldr q28, [x14, x28]\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "ldp x13, x12, [%x[inptrs], #0x50]\n" "ldr q27, [x13, x28]\n" "ldr q26, [x12, x28]\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x160]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x170]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x150]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [x10, x28]\n" "ldp x10, x9, [%x[inptrs], #0x60]\n" "ldr q24, [x10, x28]\n" "ldr q23, [x9, x28]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "ldr q22, [x26, x28]\n" "ldr q21, [x25, x28]\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "ldp x15, x14, [%x[inptrs], #0x0]\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "ldp x13, x12, [%x[inptrs], #0x10]\n" "ldp x10, x9, [%x[inptrs], #0x20]\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "zip2 v5.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v8.16b, v6.16b\n" "zip2 v6.16b, v8.16b, v6.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "zip2 v8.16b, v9.16b, v7.16b\n" "str s20, [x21, x27]\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v5.16b, v6.16b\n" "add x27, x27, #0x4\n" "zip2 v6.16b, v5.16b, v6.16b\n" "ldr q5, [%x[params], #0x140]\n" "zip2 v30.16b, v4.16b, v2.16b\n" "add %x[params], %x[params], #0x180\n" "zip1 v4.16b, v4.16b, v2.16b\n" "zip1 v2.16b, v3.16b, v1.16b\n" "zip2 v1.16b, v3.16b, v1.16b\n" "zip2 v25.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v28.16b, v26.16b\n" "zip2 v26.16b, v28.16b, v26.16b\n" "zip2 v20.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v23.16b, v21.16b\n" "zip2 v21.16b, v23.16b, v21.16b\n" "zip2 v3.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "zip1 v2.16b, v30.16b, v1.16b\n" "zip2 v1.16b, v30.16b, v1.16b\n" "zip2 v28.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v25.16b, v26.16b\n" "zip2 v26.16b, v25.16b, v26.16b\n" "zip2 v23.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v20.16b, v21.16b\n" "zip2 v21.16b, v20.16b, v21.16b\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" "bgt 1b\n" "2:" // Detached iteration "movi v19.4s, #0x0\n" ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n" ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n" "tst %x[n_channels], #0xf\n" ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n" ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n" "add x28, x28, #0x10\n" ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n" ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n" "ext v9.16b, v9.16b, v9.16b, #0x1\n" ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n" ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n" ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n" "movi v17.4s, #0x0\n" ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n" ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n" ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n" "ldr q4, [%x[params], #0x10]\n" ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n" "mls v5.4s, v19.4s, v11.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n" ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n" "ldr q9, [%x[params], #0x0]\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n" ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x60]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x40]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x50]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x30]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x70]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n" ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x20]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n" ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n" ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n" "ext v8.16b, v8.16b, v8.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n" ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n" ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n" ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n" ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n" ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n" ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n" ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n" ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0xc0]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0xa0]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0xb0]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x90]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0xd0]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n" ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x80]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n" ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n" ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n" ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n" ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n" ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n" ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n" ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n" ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n" ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n" ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x120]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x100]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x110]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0xf0]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x130]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n" ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0xe0]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n" "add %x[params], %x[params], #0x140\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n" ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n" ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n" ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n" ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n" ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n" ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n" "ext v26.16b, v26.16b, v26.16b, #0x1\n" ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n" ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n" ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n" ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n" ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "str s5, [x24, x27]\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "add x27, x27, #0x4\n" "beq 35f\n" "3:" // Oddments "and x20, %x[n_channels], #0xf\n" "add x15, x15, x28\n" "add x14, x14, x28\n" "add x13, x13, x28\n" "add x12, x12, x28\n" "add x10, x10, x28\n" "add x9, x9, x28\n" "add x26, x26, x28\n" "add x25, x25, x28\n" "tbz %x[n_channels], #3, 7f\n" "ldr d9, [x15], #0x8\n" "ldr d8, [x14], #0x8\n" "ldr d7, [x13], #0x8\n" "ldr d6, [x12], #0x8\n" "ldr d4, [x10], #0x8\n" "ldr d3, [x9], #0x8\n" "ldr d2, [x26], #0x8\n" "ldr d1, [x25], #0x8\n" "tbz %x[n_channels], #2, 5f\n" "ld1 { v9.s }[2], [x15], #0x4\n" "ld1 { v8.s }[2], [x14], #0x4\n" "ld1 { v7.s }[2], [x13], #0x4\n" "ld1 { v6.s }[2], [x12], #0x4\n" "ld1 { v4.s }[2], [x10], #0x4\n" "ld1 { v3.s }[2], [x9], #0x4\n" "ld1 { v2.s }[2], [x26], #0x4\n" "ld1 { v1.s }[2], [x25], #0x4\n" "tbz %x[n_channels], #1, 4f\n" "ld1 { v9.h }[6], [x15], #0x2\n" "ld1 { v8.h }[6], [x14], #0x2\n" "ld1 { v7.h }[6], [x13], #0x2\n" "ld1 { v6.h }[6], [x12], #0x2\n" "ld1 { v4.h }[6], [x10], #0x2\n" "ld1 { v3.h }[6], [x9], #0x2\n" "ld1 { v2.h }[6], [x26], #0x2\n" "ld1 { v1.h }[6], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[14], [x15], #0x1\n" "ld1 { v8.b }[14], [x14], #0x1\n" "ld1 { v7.b }[14], [x13], #0x1\n" "ld1 { v6.b }[14], [x12], #0x1\n" "ld1 { v4.b }[14], [x10], #0x1\n" "ld1 { v3.b }[14], [x9], #0x1\n" "ld1 { v2.b }[14], [x26], #0x1\n" "ld1 { v1.b }[14], [x25], #0x1\n" "b 11f\n" "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[12], [x15], #0x1\n" "ld1 { v8.b }[12], [x14], #0x1\n" "ld1 { v7.b }[12], [x13], #0x1\n" "ld1 { v6.b }[12], [x12], #0x1\n" "ld1 { v4.b }[12], [x10], #0x1\n" "ld1 { v3.b }[12], [x9], #0x1\n" "ld1 { v2.b }[12], [x26], #0x1\n" "ld1 { v1.b }[12], [x25], #0x1\n" "b 11f\n" "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 6f\n" "ld1 { v9.h }[4], [x15], #0x2\n" "ld1 { v8.h }[4], [x14], #0x2\n" "ld1 { v7.h }[4], [x13], #0x2\n" "ld1 { v6.h }[4], [x12], #0x2\n" "ld1 { v4.h }[4], [x10], #0x2\n" "ld1 { v3.h }[4], [x9], #0x2\n" "ld1 { v2.h }[4], [x26], #0x2\n" "ld1 { v1.h }[4], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[10], [x15], #0x1\n" "ld1 { v8.b }[10], [x14], #0x1\n" "ld1 { v7.b }[10], [x13], #0x1\n" "ld1 { v6.b }[10], [x12], #0x1\n" "ld1 { v4.b }[10], [x10], #0x1\n" "ld1 { v3.b }[10], [x9], #0x1\n" "ld1 { v2.b }[10], [x26], #0x1\n" "ld1 { v1.b }[10], [x25], #0x1\n" "b 11f\n" "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[8], [x15], #0x1\n" "ld1 { v8.b }[8], [x14], #0x1\n" "ld1 { v7.b }[8], [x13], #0x1\n" "ld1 { v6.b }[8], [x12], #0x1\n" "ld1 { v4.b }[8], [x10], #0x1\n" "ld1 { v3.b }[8], [x9], #0x1\n" "ld1 { v2.b }[8], [x26], #0x1\n" "ld1 { v1.b }[8], [x25], #0x1\n" "b 11f\n" "7:" // Oddments: Load (A): Bit 3: Unset "tbz %x[n_channels], #2, 9f\n" "ldr s9, [x15], #0x4\n" "ldr s8, [x14], #0x4\n" "ldr s7, [x13], #0x4\n" "ldr s6, [x12], #0x4\n" "ldr s4, [x10], #0x4\n" "ldr s3, [x9], #0x4\n" "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "tbz %x[n_channels], #1, 8f\n" "ld1 { v9.h }[2], [x15], #0x2\n" "ld1 { v8.h }[2], [x14], #0x2\n" "ld1 { v7.h }[2], [x13], #0x2\n" "ld1 { v6.h }[2], [x12], #0x2\n" "ld1 { v4.h }[2], [x10], #0x2\n" "ld1 { v3.h }[2], [x9], #0x2\n" "ld1 { v2.h }[2], [x26], #0x2\n" "ld1 { v1.h }[2], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[6], [x15], #0x1\n" "ld1 { v8.b }[6], [x14], #0x1\n" "ld1 { v7.b }[6], [x13], #0x1\n" "ld1 { v6.b }[6], [x12], #0x1\n" "ld1 { v4.b }[6], [x10], #0x1\n" "ld1 { v3.b }[6], [x9], #0x1\n" "ld1 { v2.b }[6], [x26], #0x1\n" "ld1 { v1.b }[6], [x25], #0x1\n" "b 11f\n" "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[4], [x15], #0x1\n" "ld1 { v8.b }[4], [x14], #0x1\n" "ld1 { v7.b }[4], [x13], #0x1\n" "ld1 { v6.b }[4], [x12], #0x1\n" "ld1 { v4.b }[4], [x10], #0x1\n" "ld1 { v3.b }[4], [x9], #0x1\n" "ld1 { v2.b }[4], [x26], #0x1\n" "ld1 { v1.b }[4], [x25], #0x1\n" "b 11f\n" "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 10f\n" "ldr h9, [x15], #0x2\n" "ldr h8, [x14], #0x2\n" "ldr h7, [x13], #0x2\n" "ldr h6, [x12], #0x2\n" "ldr h4, [x10], #0x2\n" "ldr h3, [x9], #0x2\n" "ldr h2, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[2], [x15], #0x1\n" "ld1 { v8.b }[2], [x14], #0x1\n" "ld1 { v7.b }[2], [x13], #0x1\n" "ld1 { v6.b }[2], [x12], #0x1\n" "ld1 { v4.b }[2], [x10], #0x1\n" "ld1 { v3.b }[2], [x9], #0x1\n" "ld1 { v2.b }[2], [x26], #0x1\n" "ld1 { v1.b }[2], [x25], #0x1\n" "b 11f\n" "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b9, [x15], #0x1\n" "ldr b8, [x14], #0x1\n" "ldr b7, [x13], #0x1\n" "ldr b6, [x12], #0x1\n" "ldr b4, [x10], #0x1\n" "ldr b3, [x9], #0x1\n" "ldr b2, [x26], #0x1\n" "ldr b1, [x25], #0x1\n" "11:" // Oddments: Load (A): Bit 3: End "ldp x15, x14, [%x[inptrs], #0x40]\n" "ldp x13, x12, [%x[inptrs], #0x50]\n" "add x15, x15, x28\n" "add x14, x14, x28\n" "ldp x10, x9, [%x[inptrs], #0x60]\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "add x13, x13, x28\n" "add x12, x12, x28\n" "add x10, x10, x28\n" "add x9, x9, x28\n" "add x26, x26, x28\n" "add x25, x25, x28\n" "tbz %x[n_channels], #3, 15f\n" "ldr d29, [x15], #0x8\n" "ldr d28, [x14], #0x8\n" "ldr d27, [x13], #0x8\n" "ldr d26, [x12], #0x8\n" "ldr d24, [x10], #0x8\n" "ldr d23, [x9], #0x8\n" "ldr d22, [x26], #0x8\n" "ldr d21, [x25], #0x8\n" "tbz %x[n_channels], #2, 13f\n" "ld1 { v29.s }[2], [x15], #0x4\n" "ld1 { v28.s }[2], [x14], #0x4\n" "ld1 { v27.s }[2], [x13], #0x4\n" "ld1 { v26.s }[2], [x12], #0x4\n" "ld1 { v24.s }[2], [x10], #0x4\n" "ld1 { v23.s }[2], [x9], #0x4\n" "ld1 { v22.s }[2], [x26], #0x4\n" "ld1 { v21.s }[2], [x25], #0x4\n" "tbz %x[n_channels], #1, 12f\n" "ld1 { v29.h }[6], [x15], #0x2\n" "ld1 { v28.h }[6], [x14], #0x2\n" "ld1 { v27.h }[6], [x13], #0x2\n" "ld1 { v26.h }[6], [x12], #0x2\n" "ld1 { v24.h }[6], [x10], #0x2\n" "ld1 { v23.h }[6], [x9], #0x2\n" "ld1 { v22.h }[6], [x26], #0x2\n" "ld1 { v21.h }[6], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[14], [x15], #0x1\n" "ld1 { v28.b }[14], [x14], #0x1\n" "ld1 { v27.b }[14], [x13], #0x1\n" "ld1 { v26.b }[14], [x12], #0x1\n" "ld1 { v24.b }[14], [x10], #0x1\n" "ld1 { v23.b }[14], [x9], #0x1\n" "ld1 { v22.b }[14], [x26], #0x1\n" "ld1 { v21.b }[14], [x25], #0x1\n" "b 19f\n" "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[12], [x15], #0x1\n" "ld1 { v28.b }[12], [x14], #0x1\n" "ld1 { v27.b }[12], [x13], #0x1\n" "ld1 { v26.b }[12], [x12], #0x1\n" "ld1 { v24.b }[12], [x10], #0x1\n" "ld1 { v23.b }[12], [x9], #0x1\n" "ld1 { v22.b }[12], [x26], #0x1\n" "ld1 { v21.b }[12], [x25], #0x1\n" "b 19f\n" "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 14f\n" "ld1 { v29.h }[4], [x15], #0x2\n" "ld1 { v28.h }[4], [x14], #0x2\n" "ld1 { v27.h }[4], [x13], #0x2\n" "ld1 { v26.h }[4], [x12], #0x2\n" "ld1 { v24.h }[4], [x10], #0x2\n" "ld1 { v23.h }[4], [x9], #0x2\n" "ld1 { v22.h }[4], [x26], #0x2\n" "ld1 { v21.h }[4], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[10], [x15], #0x1\n" "ld1 { v28.b }[10], [x14], #0x1\n" "ld1 { v27.b }[10], [x13], #0x1\n" "ld1 { v26.b }[10], [x12], #0x1\n" "ld1 { v24.b }[10], [x10], #0x1\n" "ld1 { v23.b }[10], [x9], #0x1\n" "ld1 { v22.b }[10], [x26], #0x1\n" "ld1 { v21.b }[10], [x25], #0x1\n" "b 19f\n" "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[8], [x15], #0x1\n" "ld1 { v28.b }[8], [x14], #0x1\n" "ld1 { v27.b }[8], [x13], #0x1\n" "ld1 { v26.b }[8], [x12], #0x1\n" "ld1 { v24.b }[8], [x10], #0x1\n" "ld1 { v23.b }[8], [x9], #0x1\n" "ld1 { v22.b }[8], [x26], #0x1\n" "ld1 { v21.b }[8], [x25], #0x1\n" "b 19f\n" "15:" // Oddments: Load (B): Bit 3: Unset "tbz %x[n_channels], #2, 17f\n" "ldr s29, [x15], #0x4\n" "ldr s28, [x14], #0x4\n" "ldr s27, [x13], #0x4\n" "ldr s26, [x12], #0x4\n" "ldr s24, [x10], #0x4\n" "ldr s23, [x9], #0x4\n" "ldr s22, [x26], #0x4\n" "ldr s21, [x25], #0x4\n" "tbz %x[n_channels], #1, 16f\n" "ld1 { v29.h }[2], [x15], #0x2\n" "ld1 { v28.h }[2], [x14], #0x2\n" "ld1 { v27.h }[2], [x13], #0x2\n" "ld1 { v26.h }[2], [x12], #0x2\n" "ld1 { v24.h }[2], [x10], #0x2\n" "ld1 { v23.h }[2], [x9], #0x2\n" "ld1 { v22.h }[2], [x26], #0x2\n" "ld1 { v21.h }[2], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[6], [x15], #0x1\n" "ld1 { v28.b }[6], [x14], #0x1\n" "ld1 { v27.b }[6], [x13], #0x1\n" "ld1 { v26.b }[6], [x12], #0x1\n" "ld1 { v24.b }[6], [x10], #0x1\n" "ld1 { v23.b }[6], [x9], #0x1\n" "ld1 { v22.b }[6], [x26], #0x1\n" "ld1 { v21.b }[6], [x25], #0x1\n" "b 19f\n" "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[4], [x15], #0x1\n" "ld1 { v28.b }[4], [x14], #0x1\n" "ld1 { v27.b }[4], [x13], #0x1\n" "ld1 { v26.b }[4], [x12], #0x1\n" "ld1 { v24.b }[4], [x10], #0x1\n" "ld1 { v23.b }[4], [x9], #0x1\n" "ld1 { v22.b }[4], [x26], #0x1\n" "ld1 { v21.b }[4], [x25], #0x1\n" "b 19f\n" "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 18f\n" "ldr h29, [x15], #0x2\n" "ldr h28, [x14], #0x2\n" "ldr h27, [x13], #0x2\n" "ldr h26, [x12], #0x2\n" "ldr h24, [x10], #0x2\n" "ldr h23, [x9], #0x2\n" "ldr h22, [x26], #0x2\n" "ldr h21, [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[2], [x15], #0x1\n" "ld1 { v28.b }[2], [x14], #0x1\n" "ld1 { v27.b }[2], [x13], #0x1\n" "ld1 { v26.b }[2], [x12], #0x1\n" "ld1 { v24.b }[2], [x10], #0x1\n" "ld1 { v23.b }[2], [x9], #0x1\n" "ld1 { v22.b }[2], [x26], #0x1\n" "ld1 { v21.b }[2], [x25], #0x1\n" "b 19f\n" "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b29, [x15], #0x1\n" "ldr b28, [x14], #0x1\n" "ldr b27, [x13], #0x1\n" "ldr b26, [x12], #0x1\n" "ldr b24, [x10], #0x1\n" "ldr b23, [x9], #0x1\n" "ldr b22, [x26], #0x1\n" "ldr b21, [x25], #0x1\n" "19:" // Oddments: Load (B): Bit 3: End "ldr q0, [%x[params], #0x10]\n" "ldr q16, [%x[params], #0x20]\n" "zip2 v30.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "ldr q31, [%x[params], #0x30]\n" "zip1 v2.16b, v3.16b, v1.16b\n" "zip2 v5.16b, v9.16b, v7.16b\n" "cmp x20, #0x4\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v8.16b, v6.16b\n" "zip2 v6.16b, v8.16b, v6.16b\n" "zip2 v1.16b, v3.16b, v1.16b\n" "zip2 v3.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "zip2 v25.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v28.16b, v26.16b\n" "movi v19.4s, #0x0\n" ".inst 0x4e8495d3 // sdot v19.4s, v14.16b, v4.16b\n" "zip2 v8.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v5.16b, v6.16b\n" "zip2 v6.16b, v5.16b, v6.16b\n" "ldr q5, [%x[params], #0x0]\n" "zip2 v26.16b, v28.16b, v26.16b\n" "zip2 v20.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v23.16b, v21.16b\n" "zip2 v21.16b, v23.16b, v21.16b\n" "zip2 v28.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v2.16b, v30.16b, v1.16b\n" ".inst 0x4e9d95d3 // sdot v19.4s, v14.16b, v29.16b\n" "zip2 v1.16b, v30.16b, v1.16b\n" "zip1 v27.16b, v25.16b, v26.16b\n" "zip2 v26.16b, v25.16b, v26.16b\n" "zip2 v23.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v20.16b, v21.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9895d2 // sdot v18.4s, v14.16b, v24.16b\n" "zip2 v21.16b, v20.16b, v21.16b\n" "mov v30.16b, v5.16b\n" ".inst 0x4e8995d3 // sdot v19.4s, v14.16b, v9.16b\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x4e899405 // sdot v5.4s, v0.16b, v9.16b\n" ".inst 0x4e849419 // sdot v25.4s, v0.16b, v4.16b\n" ".inst 0x4e849605 // sdot v5.4s, v16.16b, v4.16b\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "ext v9.16b, v9.16b, v9.16b, #0x1\n" ".inst 0x4e9d9619 // sdot v25.4s, v16.16b, v29.16b\n" ".inst 0x4e9d97e5 // sdot v5.4s, v31.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x4e89941e // sdot v30.4s, v0.16b, v9.16b\n" ".inst 0x4e849414 // sdot v20.4s, v0.16b, v4.16b\n" "movi v17.4s, #0x0\n" ".inst 0x4e8495d1 // sdot v17.4s, v14.16b, v4.16b\n" ".inst 0x4e9d95d1 // sdot v17.4s, v14.16b, v29.16b\n" ".inst 0x4e9897f9 // sdot v25.4s, v31.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x4e84961e // sdot v30.4s, v16.16b, v4.16b\n" "ldr q4, [%x[params], #0x50]\n" ".inst 0x4e9d9614 // sdot v20.4s, v16.16b, v29.16b\n" "mov v16.16b, v17.16b\n .inst 0x4e9895d0 // sdot v16.4s, v14.16b, v24.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e8995d1 // sdot v17.4s, v14.16b, v9.16b\n" "ldr q9, [%x[params], #0x40]\n" ".inst 0x4e9d97fe // sdot v30.4s, v31.16b, v29.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" ".inst 0x4e9897f4 // sdot v20.4s, v31.16b, v24.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "add %x[params], %x[params], #0x60\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "blt 20f\n" "str s5, [x24, x27]\n" "str s30, [x23, x27]\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "b 23f\n" "20:" // Oddments: Unroll 0: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 21f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 22f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 22f\n" "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End "23:" // Oddments: Unroll 0: After oddment store "subs x20, x20, #0x4\n" "add x27, x27, #0x4\n" "ble 35f\n" "ldr q5, [%x[params], #0x0]\n" "ldr q0, [%x[params], #0x10]\n" "movi v19.4s, #0x0\n" ".inst 0x4e8395d3 // sdot v19.4s, v14.16b, v3.16b\n" "ldr q16, [%x[params], #0x20]\n" "ldr q31, [%x[params], #0x30]\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "ldr q9, [%x[params], #0x40]\n" "ldr q4, [%x[params], #0x50]\n" "mov v20.16b, v5.16b\n" ".inst 0x4e889405 // sdot v5.4s, v0.16b, v8.16b\n" ".inst 0x4e9c95d3 // sdot v19.4s, v14.16b, v28.16b\n" ".inst 0x4e839419 // sdot v25.4s, v0.16b, v3.16b\n" "movi v17.4s, #0x0\n" "cmp x20, #0x4\n" ".inst 0x4e839605 // sdot v5.4s, v16.16b, v3.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9795d2 // sdot v18.4s, v14.16b, v23.16b\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e8895d3 // sdot v19.4s, v14.16b, v8.16b\n" "ext v8.16b, v8.16b, v8.16b, #0x1\n" ".inst 0x4e88941e // sdot v30.4s, v0.16b, v8.16b\n" ".inst 0x4e839414 // sdot v20.4s, v0.16b, v3.16b\n" ".inst 0x4e8395d1 // sdot v17.4s, v14.16b, v3.16b\n" ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n" ".inst 0x4e9c97e5 // sdot v5.4s, v31.16b, v28.16b\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x4e83961e // sdot v30.4s, v16.16b, v3.16b\n" ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9c95d1 // sdot v17.4s, v14.16b, v28.16b\n" ".inst 0x4e9797f9 // sdot v25.4s, v31.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x4e9c97fe // sdot v30.4s, v31.16b, v28.16b\n" ".inst 0x4e9797f4 // sdot v20.4s, v31.16b, v23.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9795d0 // sdot v16.4s, v14.16b, v23.16b\n" ".inst 0x4e8895d1 // sdot v17.4s, v14.16b, v8.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "blt 24f\n" "str s5, [x24, x27]\n" "str s30, [x23, x27]\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "b 27f\n" "24:" // Oddments: Unroll 1: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 25f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 26f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 26f\n" "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End "27:" // Oddments: Unroll 1: After oddment store "subs x20, x20, #0x4\n" "add x27, x27, #0x4\n" "ble 35f\n" "ldr q5, [%x[params], #0x0]\n" "ldr q0, [%x[params], #0x10]\n" "movi v19.4s, #0x0\n" ".inst 0x4e8295d3 // sdot v19.4s, v14.16b, v2.16b\n" "ldr q16, [%x[params], #0x20]\n" "ldr q31, [%x[params], #0x30]\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "ldr q9, [%x[params], #0x40]\n" "ldr q4, [%x[params], #0x50]\n" "mov v20.16b, v5.16b\n" ".inst 0x4e879405 // sdot v5.4s, v0.16b, v7.16b\n" ".inst 0x4e9b95d3 // sdot v19.4s, v14.16b, v27.16b\n" ".inst 0x4e829419 // sdot v25.4s, v0.16b, v2.16b\n" "movi v17.4s, #0x0\n" "cmp x20, #0x4\n" ".inst 0x4e829605 // sdot v5.4s, v16.16b, v2.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9695d2 // sdot v18.4s, v14.16b, v22.16b\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e8795d3 // sdot v19.4s, v14.16b, v7.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" ".inst 0x4e87941e // sdot v30.4s, v0.16b, v7.16b\n" ".inst 0x4e829414 // sdot v20.4s, v0.16b, v2.16b\n" ".inst 0x4e8295d1 // sdot v17.4s, v14.16b, v2.16b\n" ".inst 0x4e9b9619 // sdot v25.4s, v16.16b, v27.16b\n" ".inst 0x4e9b97e5 // sdot v5.4s, v31.16b, v27.16b\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x4e82961e // sdot v30.4s, v16.16b, v2.16b\n" ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9b95d1 // sdot v17.4s, v14.16b, v27.16b\n" ".inst 0x4e9697f9 // sdot v25.4s, v31.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x4e9b97fe // sdot v30.4s, v31.16b, v27.16b\n" ".inst 0x4e9697f4 // sdot v20.4s, v31.16b, v22.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9695d0 // sdot v16.4s, v14.16b, v22.16b\n" ".inst 0x4e8795d1 // sdot v17.4s, v14.16b, v7.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "blt 28f\n" "str s5, [x24, x27]\n" "str s30, [x23, x27]\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "b 31f\n" "28:" // Oddments: Unroll 2: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 29f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 30f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 30f\n" "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End "31:" // Oddments: Unroll 2: After oddment store "subs x20, x20, #0x4\n" "add x27, x27, #0x4\n" "ble 35f\n" "ldr q5, [%x[params], #0x0]\n" "ldr q0, [%x[params], #0x10]\n" "movi v19.4s, #0x0\n" ".inst 0x4e8195d3 // sdot v19.4s, v14.16b, v1.16b\n" "ldr q16, [%x[params], #0x20]\n" "ldr q31, [%x[params], #0x30]\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "ldr q9, [%x[params], #0x40]\n" "ldr q4, [%x[params], #0x50]\n" "mov v20.16b, v5.16b\n" ".inst 0x4e869405 // sdot v5.4s, v0.16b, v6.16b\n" ".inst 0x4e9a95d3 // sdot v19.4s, v14.16b, v26.16b\n" ".inst 0x4e819419 // sdot v25.4s, v0.16b, v1.16b\n" "movi v17.4s, #0x0\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e819605 // sdot v5.4s, v16.16b, v1.16b\n" "mov v18.16b, v19.16b\n .inst 0x4e9595d2 // sdot v18.4s, v14.16b, v21.16b\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" ".inst 0x4e8695d3 // sdot v19.4s, v14.16b, v6.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" ".inst 0x4e86941e // sdot v30.4s, v0.16b, v6.16b\n" ".inst 0x4e819414 // sdot v20.4s, v0.16b, v1.16b\n" ".inst 0x4e8195d1 // sdot v17.4s, v14.16b, v1.16b\n" ".inst 0x4e9a9619 // sdot v25.4s, v16.16b, v26.16b\n" ".inst 0x4e9a97e5 // sdot v5.4s, v31.16b, v26.16b\n" "ext v26.16b, v26.16b, v26.16b, #0x1\n" ".inst 0x4e81961e // sdot v30.4s, v16.16b, v1.16b\n" ".inst 0x4e9a9614 // sdot v20.4s, v16.16b, v26.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x4e9a95d1 // sdot v17.4s, v14.16b, v26.16b\n" ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x4e9a97fe // sdot v30.4s, v31.16b, v26.16b\n" ".inst 0x4e9597f4 // sdot v20.4s, v31.16b, v21.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x4e9595d0 // sdot v16.4s, v14.16b, v21.16b\n" ".inst 0x4e8695d1 // sdot v17.4s, v14.16b, v6.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "32:" // Oddments: Unroll 3: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 33f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 34f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 34f\n" "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End "35:" // End : [params] "+&r" (params) : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } } // namespace depthwise } // namespace arm_conv #endif // defined(__aarch64__)