/* * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if defined(__aarch64__) #include "arm_gemm.hpp" #include namespace arm_conv { namespace depthwise { void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, // Bias, should be wrapped into the parameters const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, // Requant parameters, also wrapped uint8_t *const *const outptrs ) { __asm__ __volatile__( "mov x20, #0x1\n" "orr x20, x20, #0x100\n" "ldp x15, x14, [%x[inptrs], #0x0]\n" "ldp x13, x12, [%x[inptrs], #0x10]\n" "orr x20, x20, #0x10000\n" "lsr x11, %x[n_channels], #0x4\n" "dup v14.4s, w20\n" "ldp x10, x9, [%x[inptrs], #0x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_minval]\n" "ld1r { v13.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n" "ld1r { v12.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n" "ld1r { v11.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n" "ld1r { v10.4s }, [x20]\n" "mov x28, #0x0\n" "mov x27, #0x0\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "ldp x24, x23, [%x[outptrs], #0x0]\n" "ldp x22, x21, [%x[outptrs], #0x10]\n" "cbz x11, 3f\n" "ldr q9, [x15, x28]\n" "ldr q8, [x14, x28]\n" "subs x11, x11, #0x1\n" "ldr q7, [x13, x28]\n" "ldr q6, [x12, x28]\n" "zip2 v5.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "ldr q4, [x10, x28]\n" "ldr q3, [x9, x28]\n" "zip1 v7.16b, v8.16b, v6.16b\n" "zip2 v6.16b, v8.16b, v6.16b\n" "ldr q2, [x26, x28]\n" "ldr q1, [x25, x28]\n" "zip2 v8.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "ldr q0, [%x[params], #0x10]\n" "ldr q16, [%x[params], #0x20]\n" "zip1 v7.16b, v5.16b, v6.16b\n" "zip2 v6.16b, v5.16b, v6.16b\n" "ldr q5, [%x[params], #0x0]\n" "ldr q31, [%x[params], #0x30]\n" "zip2 v30.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "ldp x15, x14, [%x[inptrs], #0x40]\n" "ldr q29, [x15, x28]\n" "zip1 v2.16b, v3.16b, v1.16b\n" "zip2 v1.16b, v3.16b, v1.16b\n" "ldr q28, [x14, x28]\n" "ldp x13, x12, [%x[inptrs], #0x50]\n" "zip2 v3.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "ldr q27, [x13, x28]\n" "ldr q26, [x12, x28]\n" "zip2 v25.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "ldp x10, x9, [%x[inptrs], #0x60]\n" "ldr q24, [x10, x28]\n" "zip1 v27.16b, v28.16b, v26.16b\n" "zip2 v26.16b, v28.16b, v26.16b\n" "ldr q23, [x9, x28]\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "zip1 v2.16b, v30.16b, v1.16b\n" "zip2 v1.16b, v30.16b, v1.16b\n" "ldr q22, [x26, x28]\n" "ldr q21, [x25, x28]\n" "zip2 v20.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v23.16b, v21.16b\n" "zip2 v21.16b, v23.16b, v21.16b\n" "ldp x15, x14, [%x[inptrs], #0x0]\n" "ldp x13, x12, [%x[inptrs], #0x10]\n" "ldp x10, x9, [%x[inptrs], #0x20]\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "zip2 v28.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v25.16b, v26.16b\n" "zip2 v26.16b, v25.16b, v26.16b\n" "add %x[params], %x[params], #0x40\n" "zip2 v23.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v20.16b, v21.16b\n" "zip2 v21.16b, v20.16b, v21.16b\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" "beq 2f\n" "1:" // Loop "movi v19.4s, #0x0\n" ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n" ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n" "add x28, x28, #0x10\n" ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n" ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n" "subs x11, x11, #0x1\n" ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n" ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n" "ext v9.16b, v9.16b, v9.16b, #0x1\n" ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n" ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n" ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n" "movi v17.4s, #0x0\n" ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n" ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n" ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n" "ldr q4, [%x[params], #0x10]\n" ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n" "mls v5.4s, v19.4s, v11.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n" ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n" "ldr q9, [%x[params], #0x0]\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n" ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x60]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x40]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x50]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x30]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x70]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n" ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x20]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n" ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n" ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n" "ext v8.16b, v8.16b, v8.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n" ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n" ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n" ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n" ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n" "ldr q3, [x9, x28]\n" ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n" ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n" ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n" ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n" "ldr q8, [x14, x28]\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0xc0]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0xa0]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0xb0]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x90]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0xd0]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n" ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x80]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n" ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n" ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n" ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n" ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n" ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n" ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n" "ldr q2, [x26, x28]\n" ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n" ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n" ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n" ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n" "ldr q7, [x13, x28]\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x120]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x100]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x110]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0xf0]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x130]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n" ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0xe0]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n" ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n" ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n" ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n" ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n" ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n" ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n" "ext v26.16b, v26.16b, v26.16b, #0x1\n" ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n" "ldr q1, [x25, x28]\n" ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n" ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n" ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n" ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n" "ldr q6, [x12, x28]\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [x15, x28]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "ldp x15, x14, [%x[inptrs], #0x40]\n" "ldr q29, [x15, x28]\n" "ldr q28, [x14, x28]\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "ldp x13, x12, [%x[inptrs], #0x50]\n" "ldr q27, [x13, x28]\n" "ldr q26, [x12, x28]\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x160]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x170]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x150]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [x10, x28]\n" "ldp x10, x9, [%x[inptrs], #0x60]\n" "ldr q24, [x10, x28]\n" "ldr q23, [x9, x28]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "ldr q22, [x26, x28]\n" "ldr q21, [x25, x28]\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "ldp x15, x14, [%x[inptrs], #0x0]\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "ldp x13, x12, [%x[inptrs], #0x10]\n" "ldp x10, x9, [%x[inptrs], #0x20]\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "zip2 v5.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v8.16b, v6.16b\n" "zip2 v6.16b, v8.16b, v6.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "zip2 v8.16b, v9.16b, v7.16b\n" "str s20, [x21, x27]\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v5.16b, v6.16b\n" "add x27, x27, #0x4\n" "zip2 v6.16b, v5.16b, v6.16b\n" "ldr q5, [%x[params], #0x140]\n" "zip2 v30.16b, v4.16b, v2.16b\n" "add %x[params], %x[params], #0x180\n" "zip1 v4.16b, v4.16b, v2.16b\n" "zip1 v2.16b, v3.16b, v1.16b\n" "zip2 v1.16b, v3.16b, v1.16b\n" "zip2 v25.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v28.16b, v26.16b\n" "zip2 v26.16b, v28.16b, v26.16b\n" "zip2 v20.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v23.16b, v21.16b\n" "zip2 v21.16b, v23.16b, v21.16b\n" "zip2 v3.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "zip1 v2.16b, v30.16b, v1.16b\n" "zip2 v1.16b, v30.16b, v1.16b\n" "zip2 v28.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v25.16b, v26.16b\n" "zip2 v26.16b, v25.16b, v26.16b\n" "zip2 v23.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v20.16b, v21.16b\n" "zip2 v21.16b, v20.16b, v21.16b\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" "bgt 1b\n" "2:" // Detached iteration "movi v19.4s, #0x0\n" ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n" ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n" "tst %x[n_channels], #0xf\n" ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n" ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n" "add x28, x28, #0x10\n" ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n" ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n" "ext v9.16b, v9.16b, v9.16b, #0x1\n" ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n" ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n" ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n" "movi v17.4s, #0x0\n" ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n" ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n" ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n" "ldr q4, [%x[params], #0x10]\n" ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n" "mls v5.4s, v19.4s, v11.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n" ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n" "ldr q9, [%x[params], #0x0]\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n" ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x60]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x40]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x50]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x30]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x70]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n" ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x20]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n" ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n" ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n" "ext v8.16b, v8.16b, v8.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n" ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n" ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n" ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n" ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n" ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n" ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n" ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n" ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0xc0]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0xa0]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0xb0]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0x90]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0xd0]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n" ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0x80]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n" ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n" ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n" ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n" ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n" ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n" ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n" ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n" ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n" ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n" ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "ldr q9, [%x[params], #0x120]\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "srshl v5.4s, v5.4s, v4.4s\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q16, [%x[params], #0x100]\n" "sqadd v25.4s, v25.4s, v31.4s\n" "ldr q31, [%x[params], #0x110]\n" "sqadd v20.4s, v20.4s, v0.4s\n" "ldr q0, [%x[params], #0xf0]\n" "add v5.4s, v5.4s, v10.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "ldr q4, [%x[params], #0x130]\n" "smax v5.4s, v5.4s, v13.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "movi v19.4s, #0x0\n" ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n" ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s5, [x24, x27]\n" "ldr q5, [%x[params], #0xe0]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n" "add %x[params], %x[params], #0x140\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "mov v30.16b, v5.16b\n" "str s20, [x21, x27]\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n" ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n" ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" "add x27, x27, #0x4\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" "movi v17.4s, #0x0\n" ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n" ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n" ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n" ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n" ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n" "ext v26.16b, v26.16b, v26.16b, #0x1\n" ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n" ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n" ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n" ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n" ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "str s5, [x24, x27]\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s30, [x23, x27]\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "add x27, x27, #0x4\n" "beq 35f\n" "3:" // Oddments "and x20, %x[n_channels], #0xf\n" "add x15, x15, x28\n" "add x14, x14, x28\n" "add x13, x13, x28\n" "add x12, x12, x28\n" "add x10, x10, x28\n" "add x9, x9, x28\n" "add x26, x26, x28\n" "add x25, x25, x28\n" "tbz %x[n_channels], #3, 7f\n" "ldr d9, [x15], #0x8\n" "ldr d8, [x14], #0x8\n" "ldr d7, [x13], #0x8\n" "ldr d6, [x12], #0x8\n" "ldr d4, [x10], #0x8\n" "ldr d3, [x9], #0x8\n" "ldr d2, [x26], #0x8\n" "ldr d1, [x25], #0x8\n" "tbz %x[n_channels], #2, 5f\n" "ld1 { v9.s }[2], [x15], #0x4\n" "ld1 { v8.s }[2], [x14], #0x4\n" "ld1 { v7.s }[2], [x13], #0x4\n" "ld1 { v6.s }[2], [x12], #0x4\n" "ld1 { v4.s }[2], [x10], #0x4\n" "ld1 { v3.s }[2], [x9], #0x4\n" "ld1 { v2.s }[2], [x26], #0x4\n" "ld1 { v1.s }[2], [x25], #0x4\n" "tbz %x[n_channels], #1, 4f\n" "ld1 { v9.h }[6], [x15], #0x2\n" "ld1 { v8.h }[6], [x14], #0x2\n" "ld1 { v7.h }[6], [x13], #0x2\n" "ld1 { v6.h }[6], [x12], #0x2\n" "ld1 { v4.h }[6], [x10], #0x2\n" "ld1 { v3.h }[6], [x9], #0x2\n" "ld1 { v2.h }[6], [x26], #0x2\n" "ld1 { v1.h }[6], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[14], [x15], #0x1\n" "ld1 { v8.b }[14], [x14], #0x1\n" "ld1 { v7.b }[14], [x13], #0x1\n" "ld1 { v6.b }[14], [x12], #0x1\n" "ld1 { v4.b }[14], [x10], #0x1\n" "ld1 { v3.b }[14], [x9], #0x1\n" "ld1 { v2.b }[14], [x26], #0x1\n" "ld1 { v1.b }[14], [x25], #0x1\n" "b 11f\n" "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[12], [x15], #0x1\n" "ld1 { v8.b }[12], [x14], #0x1\n" "ld1 { v7.b }[12], [x13], #0x1\n" "ld1 { v6.b }[12], [x12], #0x1\n" "ld1 { v4.b }[12], [x10], #0x1\n" "ld1 { v3.b }[12], [x9], #0x1\n" "ld1 { v2.b }[12], [x26], #0x1\n" "ld1 { v1.b }[12], [x25], #0x1\n" "b 11f\n" "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 6f\n" "ld1 { v9.h }[4], [x15], #0x2\n" "ld1 { v8.h }[4], [x14], #0x2\n" "ld1 { v7.h }[4], [x13], #0x2\n" "ld1 { v6.h }[4], [x12], #0x2\n" "ld1 { v4.h }[4], [x10], #0x2\n" "ld1 { v3.h }[4], [x9], #0x2\n" "ld1 { v2.h }[4], [x26], #0x2\n" "ld1 { v1.h }[4], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[10], [x15], #0x1\n" "ld1 { v8.b }[10], [x14], #0x1\n" "ld1 { v7.b }[10], [x13], #0x1\n" "ld1 { v6.b }[10], [x12], #0x1\n" "ld1 { v4.b }[10], [x10], #0x1\n" "ld1 { v3.b }[10], [x9], #0x1\n" "ld1 { v2.b }[10], [x26], #0x1\n" "ld1 { v1.b }[10], [x25], #0x1\n" "b 11f\n" "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[8], [x15], #0x1\n" "ld1 { v8.b }[8], [x14], #0x1\n" "ld1 { v7.b }[8], [x13], #0x1\n" "ld1 { v6.b }[8], [x12], #0x1\n" "ld1 { v4.b }[8], [x10], #0x1\n" "ld1 { v3.b }[8], [x9], #0x1\n" "ld1 { v2.b }[8], [x26], #0x1\n" "ld1 { v1.b }[8], [x25], #0x1\n" "b 11f\n" "7:" // Oddments: Load (A): Bit 3: Unset "tbz %x[n_channels], #2, 9f\n" "ldr s9, [x15], #0x4\n" "ldr s8, [x14], #0x4\n" "ldr s7, [x13], #0x4\n" "ldr s6, [x12], #0x4\n" "ldr s4, [x10], #0x4\n" "ldr s3, [x9], #0x4\n" "ldr s2, [x26], #0x4\n" "ldr s1, [x25], #0x4\n" "tbz %x[n_channels], #1, 8f\n" "ld1 { v9.h }[2], [x15], #0x2\n" "ld1 { v8.h }[2], [x14], #0x2\n" "ld1 { v7.h }[2], [x13], #0x2\n" "ld1 { v6.h }[2], [x12], #0x2\n" "ld1 { v4.h }[2], [x10], #0x2\n" "ld1 { v3.h }[2], [x9], #0x2\n" "ld1 { v2.h }[2], [x26], #0x2\n" "ld1 { v1.h }[2], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[6], [x15], #0x1\n" "ld1 { v8.b }[6], [x14], #0x1\n" "ld1 { v7.b }[6], [x13], #0x1\n" "ld1 { v6.b }[6], [x12], #0x1\n" "ld1 { v4.b }[6], [x10], #0x1\n" "ld1 { v3.b }[6], [x9], #0x1\n" "ld1 { v2.b }[6], [x26], #0x1\n" "ld1 { v1.b }[6], [x25], #0x1\n" "b 11f\n" "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[4], [x15], #0x1\n" "ld1 { v8.b }[4], [x14], #0x1\n" "ld1 { v7.b }[4], [x13], #0x1\n" "ld1 { v6.b }[4], [x12], #0x1\n" "ld1 { v4.b }[4], [x10], #0x1\n" "ld1 { v3.b }[4], [x9], #0x1\n" "ld1 { v2.b }[4], [x26], #0x1\n" "ld1 { v1.b }[4], [x25], #0x1\n" "b 11f\n" "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 10f\n" "ldr h9, [x15], #0x2\n" "ldr h8, [x14], #0x2\n" "ldr h7, [x13], #0x2\n" "ldr h6, [x12], #0x2\n" "ldr h4, [x10], #0x2\n" "ldr h3, [x9], #0x2\n" "ldr h2, [x26], #0x2\n" "ldr h1, [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v9.b }[2], [x15], #0x1\n" "ld1 { v8.b }[2], [x14], #0x1\n" "ld1 { v7.b }[2], [x13], #0x1\n" "ld1 { v6.b }[2], [x12], #0x1\n" "ld1 { v4.b }[2], [x10], #0x1\n" "ld1 { v3.b }[2], [x9], #0x1\n" "ld1 { v2.b }[2], [x26], #0x1\n" "ld1 { v1.b }[2], [x25], #0x1\n" "b 11f\n" "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b9, [x15], #0x1\n" "ldr b8, [x14], #0x1\n" "ldr b7, [x13], #0x1\n" "ldr b6, [x12], #0x1\n" "ldr b4, [x10], #0x1\n" "ldr b3, [x9], #0x1\n" "ldr b2, [x26], #0x1\n" "ldr b1, [x25], #0x1\n" "11:" // Oddments: Load (A): Bit 3: End "ldp x15, x14, [%x[inptrs], #0x40]\n" "ldp x13, x12, [%x[inptrs], #0x50]\n" "add x15, x15, x28\n" "add x14, x14, x28\n" "ldp x10, x9, [%x[inptrs], #0x60]\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "add x13, x13, x28\n" "add x12, x12, x28\n" "add x10, x10, x28\n" "add x9, x9, x28\n" "add x26, x26, x28\n" "add x25, x25, x28\n" "tbz %x[n_channels], #3, 15f\n" "ldr d29, [x15], #0x8\n" "ldr d28, [x14], #0x8\n" "ldr d27, [x13], #0x8\n" "ldr d26, [x12], #0x8\n" "ldr d24, [x10], #0x8\n" "ldr d23, [x9], #0x8\n" "ldr d22, [x26], #0x8\n" "ldr d21, [x25], #0x8\n" "tbz %x[n_channels], #2, 13f\n" "ld1 { v29.s }[2], [x15], #0x4\n" "ld1 { v28.s }[2], [x14], #0x4\n" "ld1 { v27.s }[2], [x13], #0x4\n" "ld1 { v26.s }[2], [x12], #0x4\n" "ld1 { v24.s }[2], [x10], #0x4\n" "ld1 { v23.s }[2], [x9], #0x4\n" "ld1 { v22.s }[2], [x26], #0x4\n" "ld1 { v21.s }[2], [x25], #0x4\n" "tbz %x[n_channels], #1, 12f\n" "ld1 { v29.h }[6], [x15], #0x2\n" "ld1 { v28.h }[6], [x14], #0x2\n" "ld1 { v27.h }[6], [x13], #0x2\n" "ld1 { v26.h }[6], [x12], #0x2\n" "ld1 { v24.h }[6], [x10], #0x2\n" "ld1 { v23.h }[6], [x9], #0x2\n" "ld1 { v22.h }[6], [x26], #0x2\n" "ld1 { v21.h }[6], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[14], [x15], #0x1\n" "ld1 { v28.b }[14], [x14], #0x1\n" "ld1 { v27.b }[14], [x13], #0x1\n" "ld1 { v26.b }[14], [x12], #0x1\n" "ld1 { v24.b }[14], [x10], #0x1\n" "ld1 { v23.b }[14], [x9], #0x1\n" "ld1 { v22.b }[14], [x26], #0x1\n" "ld1 { v21.b }[14], [x25], #0x1\n" "b 19f\n" "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[12], [x15], #0x1\n" "ld1 { v28.b }[12], [x14], #0x1\n" "ld1 { v27.b }[12], [x13], #0x1\n" "ld1 { v26.b }[12], [x12], #0x1\n" "ld1 { v24.b }[12], [x10], #0x1\n" "ld1 { v23.b }[12], [x9], #0x1\n" "ld1 { v22.b }[12], [x26], #0x1\n" "ld1 { v21.b }[12], [x25], #0x1\n" "b 19f\n" "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 14f\n" "ld1 { v29.h }[4], [x15], #0x2\n" "ld1 { v28.h }[4], [x14], #0x2\n" "ld1 { v27.h }[4], [x13], #0x2\n" "ld1 { v26.h }[4], [x12], #0x2\n" "ld1 { v24.h }[4], [x10], #0x2\n" "ld1 { v23.h }[4], [x9], #0x2\n" "ld1 { v22.h }[4], [x26], #0x2\n" "ld1 { v21.h }[4], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[10], [x15], #0x1\n" "ld1 { v28.b }[10], [x14], #0x1\n" "ld1 { v27.b }[10], [x13], #0x1\n" "ld1 { v26.b }[10], [x12], #0x1\n" "ld1 { v24.b }[10], [x10], #0x1\n" "ld1 { v23.b }[10], [x9], #0x1\n" "ld1 { v22.b }[10], [x26], #0x1\n" "ld1 { v21.b }[10], [x25], #0x1\n" "b 19f\n" "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[8], [x15], #0x1\n" "ld1 { v28.b }[8], [x14], #0x1\n" "ld1 { v27.b }[8], [x13], #0x1\n" "ld1 { v26.b }[8], [x12], #0x1\n" "ld1 { v24.b }[8], [x10], #0x1\n" "ld1 { v23.b }[8], [x9], #0x1\n" "ld1 { v22.b }[8], [x26], #0x1\n" "ld1 { v21.b }[8], [x25], #0x1\n" "b 19f\n" "15:" // Oddments: Load (B): Bit 3: Unset "tbz %x[n_channels], #2, 17f\n" "ldr s29, [x15], #0x4\n" "ldr s28, [x14], #0x4\n" "ldr s27, [x13], #0x4\n" "ldr s26, [x12], #0x4\n" "ldr s24, [x10], #0x4\n" "ldr s23, [x9], #0x4\n" "ldr s22, [x26], #0x4\n" "ldr s21, [x25], #0x4\n" "tbz %x[n_channels], #1, 16f\n" "ld1 { v29.h }[2], [x15], #0x2\n" "ld1 { v28.h }[2], [x14], #0x2\n" "ld1 { v27.h }[2], [x13], #0x2\n" "ld1 { v26.h }[2], [x12], #0x2\n" "ld1 { v24.h }[2], [x10], #0x2\n" "ld1 { v23.h }[2], [x9], #0x2\n" "ld1 { v22.h }[2], [x26], #0x2\n" "ld1 { v21.h }[2], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[6], [x15], #0x1\n" "ld1 { v28.b }[6], [x14], #0x1\n" "ld1 { v27.b }[6], [x13], #0x1\n" "ld1 { v26.b }[6], [x12], #0x1\n" "ld1 { v24.b }[6], [x10], #0x1\n" "ld1 { v23.b }[6], [x9], #0x1\n" "ld1 { v22.b }[6], [x26], #0x1\n" "ld1 { v21.b }[6], [x25], #0x1\n" "b 19f\n" "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[4], [x15], #0x1\n" "ld1 { v28.b }[4], [x14], #0x1\n" "ld1 { v27.b }[4], [x13], #0x1\n" "ld1 { v26.b }[4], [x12], #0x1\n" "ld1 { v24.b }[4], [x10], #0x1\n" "ld1 { v23.b }[4], [x9], #0x1\n" "ld1 { v22.b }[4], [x26], #0x1\n" "ld1 { v21.b }[4], [x25], #0x1\n" "b 19f\n" "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 18f\n" "ldr h29, [x15], #0x2\n" "ldr h28, [x14], #0x2\n" "ldr h27, [x13], #0x2\n" "ldr h26, [x12], #0x2\n" "ldr h24, [x10], #0x2\n" "ldr h23, [x9], #0x2\n" "ldr h22, [x26], #0x2\n" "ldr h21, [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v29.b }[2], [x15], #0x1\n" "ld1 { v28.b }[2], [x14], #0x1\n" "ld1 { v27.b }[2], [x13], #0x1\n" "ld1 { v26.b }[2], [x12], #0x1\n" "ld1 { v24.b }[2], [x10], #0x1\n" "ld1 { v23.b }[2], [x9], #0x1\n" "ld1 { v22.b }[2], [x26], #0x1\n" "ld1 { v21.b }[2], [x25], #0x1\n" "b 19f\n" "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b29, [x15], #0x1\n" "ldr b28, [x14], #0x1\n" "ldr b27, [x13], #0x1\n" "ldr b26, [x12], #0x1\n" "ldr b24, [x10], #0x1\n" "ldr b23, [x9], #0x1\n" "ldr b22, [x26], #0x1\n" "ldr b21, [x25], #0x1\n" "19:" // Oddments: Load (B): Bit 3: End "ldr q0, [%x[params], #0x10]\n" "ldr q16, [%x[params], #0x20]\n" "zip2 v30.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "ldr q31, [%x[params], #0x30]\n" "zip1 v2.16b, v3.16b, v1.16b\n" "zip2 v5.16b, v9.16b, v7.16b\n" "cmp x20, #0x4\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v8.16b, v6.16b\n" "zip2 v6.16b, v8.16b, v6.16b\n" "zip2 v1.16b, v3.16b, v1.16b\n" "zip2 v3.16b, v4.16b, v2.16b\n" "zip1 v4.16b, v4.16b, v2.16b\n" "zip2 v25.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v27.16b, v28.16b, v26.16b\n" "movi v19.4s, #0x0\n" ".inst 0x6e8495d3 // udot v19.4s, v14.16b, v4.16b\n" "zip2 v8.16b, v9.16b, v7.16b\n" "zip1 v9.16b, v9.16b, v7.16b\n" "zip1 v7.16b, v5.16b, v6.16b\n" "zip2 v6.16b, v5.16b, v6.16b\n" "ldr q5, [%x[params], #0x0]\n" "zip2 v26.16b, v28.16b, v26.16b\n" "zip2 v20.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v23.16b, v21.16b\n" "zip2 v21.16b, v23.16b, v21.16b\n" "zip2 v28.16b, v29.16b, v27.16b\n" "zip1 v29.16b, v29.16b, v27.16b\n" "zip1 v2.16b, v30.16b, v1.16b\n" ".inst 0x6e9d95d3 // udot v19.4s, v14.16b, v29.16b\n" "zip2 v1.16b, v30.16b, v1.16b\n" "zip1 v27.16b, v25.16b, v26.16b\n" "zip2 v26.16b, v25.16b, v26.16b\n" "zip2 v23.16b, v24.16b, v22.16b\n" "zip1 v24.16b, v24.16b, v22.16b\n" "zip1 v22.16b, v20.16b, v21.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9895d2 // udot v18.4s, v14.16b, v24.16b\n" "zip2 v21.16b, v20.16b, v21.16b\n" "mov v30.16b, v5.16b\n" ".inst 0x6e8995d3 // udot v19.4s, v14.16b, v9.16b\n" "mov v25.16b, v5.16b\n" "mov v20.16b, v5.16b\n" ".inst 0x6e899405 // udot v5.4s, v0.16b, v9.16b\n" ".inst 0x6e849419 // udot v25.4s, v0.16b, v4.16b\n" ".inst 0x6e849605 // udot v5.4s, v16.16b, v4.16b\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "ext v9.16b, v9.16b, v9.16b, #0x1\n" ".inst 0x6e9d9619 // udot v25.4s, v16.16b, v29.16b\n" ".inst 0x6e9d97e5 // udot v5.4s, v31.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x6e89941e // udot v30.4s, v0.16b, v9.16b\n" ".inst 0x6e849414 // udot v20.4s, v0.16b, v4.16b\n" "movi v17.4s, #0x0\n" ".inst 0x6e8495d1 // udot v17.4s, v14.16b, v4.16b\n" ".inst 0x6e9d95d1 // udot v17.4s, v14.16b, v29.16b\n" ".inst 0x6e9897f9 // udot v25.4s, v31.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x6e84961e // udot v30.4s, v16.16b, v4.16b\n" "ldr q4, [%x[params], #0x50]\n" ".inst 0x6e9d9614 // udot v20.4s, v16.16b, v29.16b\n" "mov v16.16b, v17.16b\n .inst 0x6e9895d0 // udot v16.4s, v14.16b, v24.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e8995d1 // udot v17.4s, v14.16b, v9.16b\n" "ldr q9, [%x[params], #0x40]\n" ".inst 0x6e9d97fe // udot v30.4s, v31.16b, v29.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" ".inst 0x6e9897f4 // udot v20.4s, v31.16b, v24.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "add %x[params], %x[params], #0x60\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "blt 20f\n" "str s5, [x24, x27]\n" "str s30, [x23, x27]\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "b 23f\n" "20:" // Oddments: Unroll 0: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 21f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 22f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 22f\n" "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End "23:" // Oddments: Unroll 0: After oddment store "subs x20, x20, #0x4\n" "add x27, x27, #0x4\n" "ble 35f\n" "ldr q5, [%x[params], #0x0]\n" "ldr q0, [%x[params], #0x10]\n" "movi v19.4s, #0x0\n" ".inst 0x6e8395d3 // udot v19.4s, v14.16b, v3.16b\n" "ldr q16, [%x[params], #0x20]\n" "ldr q31, [%x[params], #0x30]\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "ldr q9, [%x[params], #0x40]\n" "ldr q4, [%x[params], #0x50]\n" "mov v20.16b, v5.16b\n" ".inst 0x6e889405 // udot v5.4s, v0.16b, v8.16b\n" ".inst 0x6e9c95d3 // udot v19.4s, v14.16b, v28.16b\n" ".inst 0x6e839419 // udot v25.4s, v0.16b, v3.16b\n" "movi v17.4s, #0x0\n" "cmp x20, #0x4\n" ".inst 0x6e839605 // udot v5.4s, v16.16b, v3.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9795d2 // udot v18.4s, v14.16b, v23.16b\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "add %x[params], %x[params], #0x60\n" ".inst 0x6e8895d3 // udot v19.4s, v14.16b, v8.16b\n" "ext v8.16b, v8.16b, v8.16b, #0x1\n" ".inst 0x6e88941e // udot v30.4s, v0.16b, v8.16b\n" ".inst 0x6e839414 // udot v20.4s, v0.16b, v3.16b\n" ".inst 0x6e8395d1 // udot v17.4s, v14.16b, v3.16b\n" ".inst 0x6e9c9619 // udot v25.4s, v16.16b, v28.16b\n" ".inst 0x6e9c97e5 // udot v5.4s, v31.16b, v28.16b\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x6e83961e // udot v30.4s, v16.16b, v3.16b\n" ".inst 0x6e9c9614 // udot v20.4s, v16.16b, v28.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9c95d1 // udot v17.4s, v14.16b, v28.16b\n" ".inst 0x6e9797f9 // udot v25.4s, v31.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x6e9c97fe // udot v30.4s, v31.16b, v28.16b\n" ".inst 0x6e9797f4 // udot v20.4s, v31.16b, v23.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9795d0 // udot v16.4s, v14.16b, v23.16b\n" ".inst 0x6e8895d1 // udot v17.4s, v14.16b, v8.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "blt 24f\n" "str s5, [x24, x27]\n" "str s30, [x23, x27]\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "b 27f\n" "24:" // Oddments: Unroll 1: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 25f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 26f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 26f\n" "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End "27:" // Oddments: Unroll 1: After oddment store "subs x20, x20, #0x4\n" "add x27, x27, #0x4\n" "ble 35f\n" "ldr q5, [%x[params], #0x0]\n" "ldr q0, [%x[params], #0x10]\n" "movi v19.4s, #0x0\n" ".inst 0x6e8295d3 // udot v19.4s, v14.16b, v2.16b\n" "ldr q16, [%x[params], #0x20]\n" "ldr q31, [%x[params], #0x30]\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "ldr q9, [%x[params], #0x40]\n" "ldr q4, [%x[params], #0x50]\n" "mov v20.16b, v5.16b\n" ".inst 0x6e879405 // udot v5.4s, v0.16b, v7.16b\n" ".inst 0x6e9b95d3 // udot v19.4s, v14.16b, v27.16b\n" ".inst 0x6e829419 // udot v25.4s, v0.16b, v2.16b\n" "movi v17.4s, #0x0\n" "cmp x20, #0x4\n" ".inst 0x6e829605 // udot v5.4s, v16.16b, v2.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9695d2 // udot v18.4s, v14.16b, v22.16b\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "add %x[params], %x[params], #0x60\n" ".inst 0x6e8795d3 // udot v19.4s, v14.16b, v7.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" ".inst 0x6e87941e // udot v30.4s, v0.16b, v7.16b\n" ".inst 0x6e829414 // udot v20.4s, v0.16b, v2.16b\n" ".inst 0x6e8295d1 // udot v17.4s, v14.16b, v2.16b\n" ".inst 0x6e9b9619 // udot v25.4s, v16.16b, v27.16b\n" ".inst 0x6e9b97e5 // udot v5.4s, v31.16b, v27.16b\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x6e82961e // udot v30.4s, v16.16b, v2.16b\n" ".inst 0x6e9b9614 // udot v20.4s, v16.16b, v27.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9b95d1 // udot v17.4s, v14.16b, v27.16b\n" ".inst 0x6e9697f9 // udot v25.4s, v31.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x6e9b97fe // udot v30.4s, v31.16b, v27.16b\n" ".inst 0x6e9697f4 // udot v20.4s, v31.16b, v22.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9695d0 // udot v16.4s, v14.16b, v22.16b\n" ".inst 0x6e8795d1 // udot v17.4s, v14.16b, v7.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "blt 28f\n" "str s5, [x24, x27]\n" "str s30, [x23, x27]\n" "str s25, [x22, x27]\n" "str s20, [x21, x27]\n" "b 31f\n" "28:" // Oddments: Unroll 2: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 29f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 30f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 30f\n" "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End "31:" // Oddments: Unroll 2: After oddment store "subs x20, x20, #0x4\n" "add x27, x27, #0x4\n" "ble 35f\n" "ldr q5, [%x[params], #0x0]\n" "ldr q0, [%x[params], #0x10]\n" "movi v19.4s, #0x0\n" ".inst 0x6e8195d3 // udot v19.4s, v14.16b, v1.16b\n" "ldr q16, [%x[params], #0x20]\n" "ldr q31, [%x[params], #0x30]\n" "mov v30.16b, v5.16b\n" "mov v25.16b, v5.16b\n" "ldr q9, [%x[params], #0x40]\n" "ldr q4, [%x[params], #0x50]\n" "mov v20.16b, v5.16b\n" ".inst 0x6e869405 // udot v5.4s, v0.16b, v6.16b\n" ".inst 0x6e9a95d3 // udot v19.4s, v14.16b, v26.16b\n" ".inst 0x6e819419 // udot v25.4s, v0.16b, v1.16b\n" "movi v17.4s, #0x0\n" "add %x[params], %x[params], #0x60\n" ".inst 0x6e819605 // udot v5.4s, v16.16b, v1.16b\n" "mov v18.16b, v19.16b\n .inst 0x6e9595d2 // udot v18.4s, v14.16b, v21.16b\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" ".inst 0x6e8695d3 // udot v19.4s, v14.16b, v6.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" ".inst 0x6e86941e // udot v30.4s, v0.16b, v6.16b\n" ".inst 0x6e819414 // udot v20.4s, v0.16b, v1.16b\n" ".inst 0x6e8195d1 // udot v17.4s, v14.16b, v1.16b\n" ".inst 0x6e9a9619 // udot v25.4s, v16.16b, v26.16b\n" ".inst 0x6e9a97e5 // udot v5.4s, v31.16b, v26.16b\n" "ext v26.16b, v26.16b, v26.16b, #0x1\n" ".inst 0x6e81961e // udot v30.4s, v16.16b, v1.16b\n" ".inst 0x6e9a9614 // udot v20.4s, v16.16b, v26.16b\n" "mls v5.4s, v19.4s, v11.4s\n" ".inst 0x6e9a95d1 // udot v17.4s, v14.16b, v26.16b\n" ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x6e9a97fe // udot v30.4s, v31.16b, v26.16b\n" ".inst 0x6e9597f4 // udot v20.4s, v31.16b, v21.16b\n" "sqrdmulh v5.4s, v5.4s, v9.4s\n" "mov v16.16b, v17.16b\n .inst 0x6e9595d0 // udot v16.4s, v14.16b, v21.16b\n" ".inst 0x6e8695d1 // udot v17.4s, v14.16b, v6.16b\n" "mls v30.4s, v17.4s, v11.4s\n" "mls v25.4s, v18.4s, v11.4s\n" "mls v20.4s, v16.4s, v11.4s\n" "and v0.16b, v5.16b, v4.16b\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqrdmulh v30.4s, v30.4s, v9.4s\n" "sqrdmulh v25.4s, v25.4s, v9.4s\n" "sqrdmulh v20.4s, v20.4s, v9.4s\n" "sqadd v5.4s, v5.4s, v0.4s\n" "and v16.16b, v30.16b, v4.16b\n" "and v31.16b, v25.16b, v4.16b\n" "and v0.16b, v20.16b, v4.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sshr v31.4s, v31.4s, #0x1f\n" "sshr v0.4s, v0.4s, #0x1f\n" "sqadd v30.4s, v30.4s, v16.4s\n" "sqadd v25.4s, v25.4s, v31.4s\n" "sqadd v20.4s, v20.4s, v0.4s\n" "srshl v5.4s, v5.4s, v4.4s\n" "srshl v30.4s, v30.4s, v4.4s\n" "srshl v25.4s, v25.4s, v4.4s\n" "srshl v20.4s, v20.4s, v4.4s\n" "add v5.4s, v5.4s, v10.4s\n" "add v30.4s, v30.4s, v10.4s\n" "add v25.4s, v25.4s, v10.4s\n" "add v20.4s, v20.4s, v10.4s\n" "smax v5.4s, v5.4s, v13.4s\n" "smax v30.4s, v30.4s, v13.4s\n" "smax v25.4s, v25.4s, v13.4s\n" "smax v20.4s, v20.4s, v13.4s\n" "smin v5.4s, v5.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v5.16b, v5.16b, v5.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "32:" // Oddments: Unroll 3: Oddment store "add x24, x24, x27\n" "add x23, x23, x27\n" "add x22, x22, x27\n" "add x21, x21, x27\n" "tbz x20, #1, 33f\n" "st1 { v5.h }[0], [x24], #0x2\n" "st1 { v30.h }[0], [x23], #0x2\n" "st1 { v25.h }[0], [x22], #0x2\n" "st1 { v20.h }[0], [x21], #0x2\n" "tbz x20, #0, 34f\n" "st1 { v5.b }[2], [x24], #0x1\n" "st1 { v30.b }[2], [x23], #0x1\n" "st1 { v25.b }[2], [x22], #0x1\n" "st1 { v20.b }[2], [x21], #0x1\n" "b 34f\n" "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset "st1 { v5.b }[0], [x24], #0x1\n" "st1 { v30.b }[0], [x23], #0x1\n" "st1 { v25.b }[0], [x22], #0x1\n" "st1 { v20.b }[0], [x21], #0x1\n" "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End "35:" // End : [params] "+&r" (params) : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } } // namespace depthwise } // namespace arm_conv #endif // defined(__aarch64__)