/* * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if defined(__aarch64__) #include "arm_gemm.hpp" #include namespace arm_conv { namespace depthwise { void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs) { __asm__ __volatile__( "lsr x15, %x[n_channels], #0x4\n" "add x20, %x[qp], %[offsetof_Requantize32_minval]\n" "ld1r { v8.4s }, [x20]\n" "ldp x14, x13, [%x[inptrs], #0x0]\n" "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n" "ld1r { v12.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n" "ld1r { v15.4s }, [x20]\n" "mov x12, #0x0\n" "mov x11, #0x0\n" "ldp x10, x9, [%x[inptrs], #0x10]\n" "ldp x28, x27, [%x[inptrs], #0x20]\n" "ldp x26, x21, [%x[inptrs], #0x30]\n" "ldp x25, x24, [%x[outptrs], #0x0]\n" "ldp x23, x22, [%x[outptrs], #0x10]\n" "cbz x15, 3f\n" "ldr q11, [x14, x12]\n" "ldr q20, [x13, x12]\n" "subs x15, x15, #0x1\n" "ldr q16, [x10, x12]\n" "ldr q14, [x9, x12]\n" "zip2 v19.16b, v11.16b, v16.16b\n" "zip1 v11.16b, v11.16b, v16.16b\n" "ldr q13, [x28, x12]\n" "ldr q18, [x27, x12]\n" "zip1 v17.16b, v20.16b, v14.16b\n" "zip2 v14.16b, v20.16b, v14.16b\n" "ldr q16, [x26, x12]\n" "ldr q27, [x21, x12]\n" "zip2 v10.16b, v11.16b, v17.16b\n" "zip1 v11.16b, v11.16b, v17.16b\n" "ldr q24, [%x[params], #0x10]\n" "ldr q9, [%x[params], #0x20]\n" "zip1 v3.16b, v19.16b, v14.16b\n" "zip2 v14.16b, v19.16b, v14.16b\n" "ldr q31, [%x[params], #0x0]\n" "ldr q6, [%x[params], #0x30]\n" "zip2 v30.16b, v13.16b, v16.16b\n" "zip1 v13.16b, v13.16b, v16.16b\n" "ldp x21, x20, [%x[inptrs], #0x40]\n" "ldr q5, [x21, x12]\n" "zip1 v16.16b, v18.16b, v27.16b\n" "zip2 v27.16b, v18.16b, v27.16b\n" "ldr q17, [x20, x12]\n" "ldp x21, x20, [%x[inptrs], #0x50]\n" "zip2 v28.16b, v13.16b, v16.16b\n" "zip1 v13.16b, v13.16b, v16.16b\n" "ldr q16, [x21, x12]\n" "ldr q7, [x20, x12]\n" "zip2 v20.16b, v5.16b, v16.16b\n" "zip1 v5.16b, v5.16b, v16.16b\n" "ldp x21, x20, [%x[inptrs], #0x60]\n" "ldr q16, [x21, x12]\n" "zip1 v22.16b, v17.16b, v7.16b\n" "zip2 v7.16b, v17.16b, v7.16b\n" "ldr q19, [x20, x12]\n" "ldp x21, x20, [%x[inptrs], #0x70]\n" "zip1 v21.16b, v30.16b, v27.16b\n" "zip2 v27.16b, v30.16b, v27.16b\n" "ldr q30, [x21, x12]\n" "ldr q1, [x20, x12]\n" "zip2 v17.16b, v16.16b, v30.16b\n" "zip1 v16.16b, v16.16b, v30.16b\n" "zip1 v18.16b, v19.16b, v1.16b\n" "zip2 v1.16b, v19.16b, v1.16b\n" "ldp x14, x13, [%x[inptrs], #0x0]\n" "ldp x10, x9, [%x[inptrs], #0x10]\n" "ldp x28, x27, [%x[inptrs], #0x20]\n" "ldp x26, x21, [%x[inptrs], #0x30]\n" "zip2 v29.16b, v5.16b, v22.16b\n" "zip1 v5.16b, v5.16b, v22.16b\n" "zip1 v0.16b, v20.16b, v7.16b\n" "zip2 v7.16b, v20.16b, v7.16b\n" "add %x[params], %x[params], #0x40\n" "zip2 v30.16b, v16.16b, v18.16b\n" "zip1 v16.16b, v16.16b, v18.16b\n" "zip1 v2.16b, v17.16b, v1.16b\n" "zip2 v1.16b, v17.16b, v1.16b\n" "mov v26.16b, v31.16b\n" "mov v18.16b, v31.16b\n" "mov v4.16b, v31.16b\n" "beq 2f\n" "1:" // Loop ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n" ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n" "ext v11.16b, v11.16b, v11.16b, #0x1\n" "add x12, x12, #0x10\n" ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n" "ext v13.16b, v13.16b, v13.16b, #0x1\n" ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n" "ldr q17, [%x[params], #0x0]\n" ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n" ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n" "subs x15, x15, #0x1\n" ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n" "ext v5.16b, v5.16b, v5.16b, #0x1\n" ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n" "ldr q20, [%x[params], #0x10]\n" ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n" ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n" "ext v16.16b, v16.16b, v16.16b, #0x1\n" "sqrdmulh v31.4s, v31.4s, v17.4s\n" ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n" ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n" "and v16.16b, v31.16b, v20.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v17.4s\n" "sqrdmulh v18.4s, v18.4s, v17.4s\n" "sqrdmulh v4.4s, v4.4s, v17.4s\n" "ldr q5, [%x[params], #0x60]\n" "sqadd v31.4s, v31.4s, v16.4s\n" "and v19.16b, v26.16b, v20.16b\n" "and v17.16b, v18.16b, v20.16b\n" "and v16.16b, v4.16b, v20.16b\n" "sshr v19.4s, v19.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "srshl v31.4s, v31.4s, v20.4s\n" "sqadd v26.4s, v26.4s, v19.4s\n" "ldr q13, [%x[params], #0x40]\n" "sqadd v18.4s, v18.4s, v17.4s\n" "ldr q17, [%x[params], #0x50]\n" "sqadd v4.4s, v4.4s, v16.4s\n" "ldr q16, [%x[params], #0x30]\n" "add v31.4s, v31.4s, v15.4s\n" "srshl v26.4s, v26.4s, v20.4s\n" "srshl v18.4s, v18.4s, v20.4s\n" "srshl v4.4s, v4.4s, v20.4s\n" "ldr q22, [%x[params], #0x70]\n" "smax v31.4s, v31.4s, v8.4s\n" "add v26.4s, v26.4s, v15.4s\n" "add v18.4s, v18.4s, v15.4s\n" "add v4.4s, v4.4s, v15.4s\n" "smin v31.4s, v31.4s, v12.4s\n" "smax v26.4s, v26.4s, v8.4s\n" "smax v18.4s, v18.4s, v8.4s\n" "smax v4.4s, v4.4s, v8.4s\n" "smin v26.4s, v26.4s, v12.4s\n" "smin v18.4s, v18.4s, v12.4s\n" "smin v4.4s, v4.4s, v12.4s\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s31, [x25, x11]\n" "ldr q24, [%x[params], #0x20]\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "str s26, [x24, x11]\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "str s18, [x23, x11]\n" "mov v26.16b, v24.16b\n" "str s4, [x22, x11]\n" "mov v25.16b, v24.16b\n" "mov v23.16b, v24.16b\n" ".inst 0x4e8a9618 // sdot v24.4s, v16.16b, v10.16b\n" ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n" ".inst 0x4e9c95b8 // sdot v24.4s, v13.16b, v28.16b\n" "ext v10.16b, v10.16b, v10.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x4e8a961a // sdot v26.4s, v16.16b, v10.16b\n" "ldr q10, [x13, x12]\n" ".inst 0x4e9c9617 // sdot v23.4s, v16.16b, v28.16b\n" ".inst 0x4e9d95b9 // sdot v25.4s, v13.16b, v29.16b\n" ".inst 0x4e9d9638 // sdot v24.4s, v17.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x4e9c95ba // sdot v26.4s, v13.16b, v28.16b\n" "ldr q20, [x27, x12]\n" ".inst 0x4e9d95b7 // sdot v23.4s, v13.16b, v29.16b\n" "sqrdmulh v24.4s, v24.4s, v5.4s\n" ".inst 0x4e9e9639 // sdot v25.4s, v17.16b, v30.16b\n" "ext v30.16b, v30.16b, v30.16b, #0x1\n" ".inst 0x4e9d963a // sdot v26.4s, v17.16b, v29.16b\n" ".inst 0x4e9e9637 // sdot v23.4s, v17.16b, v30.16b\n" "and v16.16b, v24.16b, v22.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v5.4s\n" "sqrdmulh v25.4s, v25.4s, v5.4s\n" "sqrdmulh v23.4s, v23.4s, v5.4s\n" "ldr q19, [%x[params], #0xc0]\n" "sqadd v24.4s, v24.4s, v16.4s\n" "and v18.16b, v26.16b, v22.16b\n" "and v17.16b, v25.16b, v22.16b\n" "and v16.16b, v23.16b, v22.16b\n" "sshr v18.4s, v18.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "srshl v24.4s, v24.4s, v22.4s\n" "sqadd v26.4s, v26.4s, v18.4s\n" "ldr q18, [%x[params], #0xa0]\n" "sqadd v25.4s, v25.4s, v17.4s\n" "ldr q17, [%x[params], #0xb0]\n" "sqadd v23.4s, v23.4s, v16.4s\n" "ldr q16, [%x[params], #0x90]\n" "add v24.4s, v24.4s, v15.4s\n" "srshl v26.4s, v26.4s, v22.4s\n" "srshl v25.4s, v25.4s, v22.4s\n" "srshl v23.4s, v23.4s, v22.4s\n" "ldr q22, [%x[params], #0xd0]\n" "smax v24.4s, v24.4s, v8.4s\n" "add v26.4s, v26.4s, v15.4s\n" "add v25.4s, v25.4s, v15.4s\n" "add v23.4s, v23.4s, v15.4s\n" "smin v24.4s, v24.4s, v12.4s\n" "smax v26.4s, v26.4s, v8.4s\n" "smax v25.4s, v25.4s, v8.4s\n" "smax v23.4s, v23.4s, v8.4s\n" "smin v26.4s, v26.4s, v12.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smin v23.4s, v23.4s, v12.4s\n" "uzp1 v24.16b, v24.16b, v24.16b\n" "uzp1 v24.16b, v24.16b, v24.16b\n" "str s24, [x25, x11]\n" "ldr q24, [%x[params], #0x80]\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v23.16b, v23.16b, v23.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s26, [x24, x11]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v23.16b, v23.16b, v23.16b\n" "str s25, [x23, x11]\n" "str s23, [x22, x11]\n" "mov v23.16b, v24.16b\n" "mov v31.16b, v24.16b\n" ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n" "mov v13.16b, v24.16b\n" ".inst 0x4e839618 // sdot v24.4s, v16.16b, v3.16b\n" ".inst 0x4e959658 // sdot v24.4s, v18.16b, v21.16b\n" "add x11, x11, #0x4\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x4e839617 // sdot v23.4s, v16.16b, v3.16b\n" "ldr q3, [x10, x12]\n" ".inst 0x4e95960d // sdot v13.4s, v16.16b, v21.16b\n" ".inst 0x4e80965f // sdot v31.4s, v18.16b, v0.16b\n" ".inst 0x4e809638 // sdot v24.4s, v17.16b, v0.16b\n" "ext v0.16b, v0.16b, v0.16b, #0x1\n" ".inst 0x4e959657 // sdot v23.4s, v18.16b, v21.16b\n" "ldr q4, [x26, x12]\n" ".inst 0x4e80964d // sdot v13.4s, v18.16b, v0.16b\n" ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "sqrdmulh v24.4s, v24.4s, v19.4s\n" ".inst 0x4e809637 // sdot v23.4s, v17.16b, v0.16b\n" ".inst 0x4e82962d // sdot v13.4s, v17.16b, v2.16b\n" "and v16.16b, v24.16b, v22.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v23.4s, v23.4s, v19.4s\n" "sqrdmulh v31.4s, v31.4s, v19.4s\n" "sqrdmulh v13.4s, v13.4s, v19.4s\n" "ldr q19, [%x[params], #0x120]\n" "sqadd v24.4s, v24.4s, v16.4s\n" "and v18.16b, v23.16b, v22.16b\n" "and v17.16b, v31.16b, v22.16b\n" "and v16.16b, v13.16b, v22.16b\n" "sshr v18.4s, v18.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "srshl v24.4s, v24.4s, v22.4s\n" "sqadd v23.4s, v23.4s, v18.4s\n" "ldr q18, [%x[params], #0x100]\n" "sqadd v31.4s, v31.4s, v17.4s\n" "ldr q17, [%x[params], #0x110]\n" "sqadd v13.4s, v13.4s, v16.4s\n" "ldr q16, [%x[params], #0xf0]\n" "add v24.4s, v24.4s, v15.4s\n" "srshl v23.4s, v23.4s, v22.4s\n" "srshl v31.4s, v31.4s, v22.4s\n" "srshl v13.4s, v13.4s, v22.4s\n" "ldr q22, [%x[params], #0x130]\n" "smax v24.4s, v24.4s, v8.4s\n" "add v23.4s, v23.4s, v15.4s\n" "add v31.4s, v31.4s, v15.4s\n" "add v13.4s, v13.4s, v15.4s\n" "smin v24.4s, v24.4s, v12.4s\n" "smax v23.4s, v23.4s, v8.4s\n" "smax v31.4s, v31.4s, v8.4s\n" "smax v13.4s, v13.4s, v8.4s\n" "smin v23.4s, v23.4s, v12.4s\n" "smin v31.4s, v31.4s, v12.4s\n" "smin v13.4s, v13.4s, v12.4s\n" "uzp1 v24.16b, v24.16b, v24.16b\n" "uzp1 v24.16b, v24.16b, v24.16b\n" "uzp1 v23.16b, v23.16b, v23.16b\n" "str s24, [x25, x11]\n" "ldr q2, [%x[params], #0xe0]\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v13.16b, v13.16b, v13.16b\n" "uzp1 v23.16b, v23.16b, v23.16b\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "str s23, [x24, x11]\n" "uzp1 v13.16b, v13.16b, v13.16b\n" "str s31, [x23, x11]\n" "mov v25.16b, v2.16b\n" "str s13, [x22, x11]\n" "mov v21.16b, v2.16b\n" "mov v30.16b, v2.16b\n" ".inst 0x4e8e9602 // sdot v2.4s, v16.16b, v14.16b\n" ".inst 0x4e9b9615 // sdot v21.4s, v16.16b, v27.16b\n" ".inst 0x4e9b9642 // sdot v2.4s, v18.16b, v27.16b\n" "ext v14.16b, v14.16b, v14.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x4e8e9619 // sdot v25.4s, v16.16b, v14.16b\n" "ldr q14, [x9, x12]\n" ".inst 0x4e9b961e // sdot v30.4s, v16.16b, v27.16b\n" ".inst 0x4e879655 // sdot v21.4s, v18.16b, v7.16b\n" ".inst 0x4e879622 // sdot v2.4s, v17.16b, v7.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" ".inst 0x4e9b9659 // sdot v25.4s, v18.16b, v27.16b\n" "ldr q27, [x21, x12]\n" ".inst 0x4e87965e // sdot v30.4s, v18.16b, v7.16b\n" "sqrdmulh v2.4s, v2.4s, v19.4s\n" ".inst 0x4e819635 // sdot v21.4s, v17.16b, v1.16b\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" ".inst 0x4e879639 // sdot v25.4s, v17.16b, v7.16b\n" ".inst 0x4e81963e // sdot v30.4s, v17.16b, v1.16b\n" "and v16.16b, v2.16b, v22.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v25.4s, v25.4s, v19.4s\n" "sqrdmulh v21.4s, v21.4s, v19.4s\n" "sqrdmulh v30.4s, v30.4s, v19.4s\n" "ldr q11, [x14, x12]\n" "ldp x21, x20, [%x[inptrs], #0x40]\n" "ldr q5, [x21, x12]\n" "ldr q29, [x20, x12]\n" "sqadd v2.4s, v2.4s, v16.4s\n" "and v19.16b, v25.16b, v22.16b\n" "and v17.16b, v21.16b, v22.16b\n" "and v16.16b, v30.16b, v22.16b\n" "ldp x21, x20, [%x[inptrs], #0x50]\n" "ldr q26, [x21, x12]\n" "ldr q7, [x20, x12]\n" "sshr v19.4s, v19.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "srshl v2.4s, v2.4s, v22.4s\n" "sqadd v25.4s, v25.4s, v19.4s\n" "ldr q9, [%x[params], #0x160]\n" "sqadd v21.4s, v21.4s, v17.4s\n" "ldr q6, [%x[params], #0x170]\n" "sqadd v30.4s, v30.4s, v16.4s\n" "ldr q24, [%x[params], #0x150]\n" "add v2.4s, v2.4s, v15.4s\n" "srshl v25.4s, v25.4s, v22.4s\n" "srshl v21.4s, v21.4s, v22.4s\n" "srshl v30.4s, v30.4s, v22.4s\n" "ldr q13, [x28, x12]\n" "smax v2.4s, v2.4s, v8.4s\n" "ldp x21, x20, [%x[inptrs], #0x60]\n" "ldr q16, [x21, x12]\n" "ldr q28, [x20, x12]\n" "add v25.4s, v25.4s, v15.4s\n" "add v21.4s, v21.4s, v15.4s\n" "add v30.4s, v30.4s, v15.4s\n" "smin v2.4s, v2.4s, v12.4s\n" "ldp x21, x20, [%x[inptrs], #0x70]\n" "ldr q23, [x21, x12]\n" "ldr q1, [x20, x12]\n" "smax v25.4s, v25.4s, v8.4s\n" "smax v21.4s, v21.4s, v8.4s\n" "ldp x14, x13, [%x[inptrs], #0x0]\n" "smax v30.4s, v30.4s, v8.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "ldp x10, x9, [%x[inptrs], #0x10]\n" "ldp x28, x27, [%x[inptrs], #0x20]\n" "smin v21.4s, v21.4s, v12.4s\n" "smin v30.4s, v30.4s, v12.4s\n" "ldp x26, x21, [%x[inptrs], #0x30]\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "str s2, [x25, x11]\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "zip2 v18.16b, v11.16b, v3.16b\n" "zip1 v11.16b, v11.16b, v3.16b\n" "zip1 v17.16b, v10.16b, v14.16b\n" "zip2 v14.16b, v10.16b, v14.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s25, [x24, x11]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v30.16b, v30.16b, v30.16b\n" "str s21, [x23, x11]\n" "str s30, [x22, x11]\n" "zip2 v10.16b, v11.16b, v17.16b\n" "zip1 v11.16b, v11.16b, v17.16b\n" "add x11, x11, #0x4\n" "zip1 v3.16b, v18.16b, v14.16b\n" "zip2 v14.16b, v18.16b, v14.16b\n" "ldr q31, [%x[params], #0x140]\n" "add %x[params], %x[params], #0x180\n" "zip2 v22.16b, v13.16b, v4.16b\n" "zip1 v13.16b, v13.16b, v4.16b\n" "zip1 v2.16b, v20.16b, v27.16b\n" "zip2 v27.16b, v20.16b, v27.16b\n" "zip2 v19.16b, v5.16b, v26.16b\n" "zip1 v5.16b, v5.16b, v26.16b\n" "zip1 v18.16b, v29.16b, v7.16b\n" "zip2 v7.16b, v29.16b, v7.16b\n" "zip2 v4.16b, v16.16b, v23.16b\n" "zip1 v16.16b, v16.16b, v23.16b\n" "zip1 v17.16b, v28.16b, v1.16b\n" "zip2 v1.16b, v28.16b, v1.16b\n" "zip2 v28.16b, v13.16b, v2.16b\n" "zip1 v13.16b, v13.16b, v2.16b\n" "zip1 v21.16b, v22.16b, v27.16b\n" "zip2 v27.16b, v22.16b, v27.16b\n" "zip2 v29.16b, v5.16b, v18.16b\n" "zip1 v5.16b, v5.16b, v18.16b\n" "zip1 v0.16b, v19.16b, v7.16b\n" "zip2 v7.16b, v19.16b, v7.16b\n" "zip2 v30.16b, v16.16b, v17.16b\n" "zip1 v16.16b, v16.16b, v17.16b\n" "zip1 v2.16b, v4.16b, v1.16b\n" "zip2 v1.16b, v4.16b, v1.16b\n" "mov v26.16b, v31.16b\n" "mov v18.16b, v31.16b\n" "mov v4.16b, v31.16b\n" "bgt 1b\n" "2:" // Detached iteration ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n" ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n" "ext v11.16b, v11.16b, v11.16b, #0x1\n" "tst %x[n_channels], #0xf\n" ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n" "ext v13.16b, v13.16b, v13.16b, #0x1\n" ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n" "ldr q17, [%x[params], #0x0]\n" ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n" ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n" "add x12, x12, #0x10\n" ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n" "ext v5.16b, v5.16b, v5.16b, #0x1\n" ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n" "ldr q19, [%x[params], #0x10]\n" ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n" ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n" "ext v16.16b, v16.16b, v16.16b, #0x1\n" "sqrdmulh v31.4s, v31.4s, v17.4s\n" ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n" ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n" "and v16.16b, v31.16b, v19.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v17.4s\n" "sqrdmulh v18.4s, v18.4s, v17.4s\n" "sqrdmulh v4.4s, v4.4s, v17.4s\n" "ldr q24, [%x[params], #0x60]\n" "sqadd v31.4s, v31.4s, v16.4s\n" "and v20.16b, v26.16b, v19.16b\n" "and v17.16b, v18.16b, v19.16b\n" "and v16.16b, v4.16b, v19.16b\n" "sshr v20.4s, v20.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "srshl v31.4s, v31.4s, v19.4s\n" "sqadd v26.4s, v26.4s, v20.4s\n" "ldr q5, [%x[params], #0x40]\n" "sqadd v18.4s, v18.4s, v17.4s\n" "ldr q17, [%x[params], #0x50]\n" "sqadd v4.4s, v4.4s, v16.4s\n" "ldr q16, [%x[params], #0x30]\n" "add v31.4s, v31.4s, v15.4s\n" "srshl v26.4s, v26.4s, v19.4s\n" "srshl v18.4s, v18.4s, v19.4s\n" "srshl v4.4s, v4.4s, v19.4s\n" "ldr q23, [%x[params], #0x70]\n" "smax v31.4s, v31.4s, v8.4s\n" "add v26.4s, v26.4s, v15.4s\n" "add v18.4s, v18.4s, v15.4s\n" "add v4.4s, v4.4s, v15.4s\n" "smin v31.4s, v31.4s, v12.4s\n" "smax v26.4s, v26.4s, v8.4s\n" "smax v18.4s, v18.4s, v8.4s\n" "smax v4.4s, v4.4s, v8.4s\n" "smin v26.4s, v26.4s, v12.4s\n" "smin v18.4s, v18.4s, v12.4s\n" "smin v4.4s, v4.4s, v12.4s\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s31, [x25, x11]\n" "ldr q25, [%x[params], #0x20]\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "str s26, [x24, x11]\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "str s18, [x23, x11]\n" "mov v22.16b, v25.16b\n" "str s4, [x22, x11]\n" "mov v20.16b, v25.16b\n" "mov v19.16b, v25.16b\n" ".inst 0x4e8a9619 // sdot v25.4s, v16.16b, v10.16b\n" ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n" ".inst 0x4e9c94b9 // sdot v25.4s, v5.16b, v28.16b\n" "ext v10.16b, v10.16b, v10.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x4e8a9616 // sdot v22.4s, v16.16b, v10.16b\n" ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n" ".inst 0x4e9d94b4 // sdot v20.4s, v5.16b, v29.16b\n" ".inst 0x4e9d9639 // sdot v25.4s, v17.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x4e9c94b6 // sdot v22.4s, v5.16b, v28.16b\n" ".inst 0x4e9d94b3 // sdot v19.4s, v5.16b, v29.16b\n" "sqrdmulh v25.4s, v25.4s, v24.4s\n" ".inst 0x4e9e9634 // sdot v20.4s, v17.16b, v30.16b\n" "ext v30.16b, v30.16b, v30.16b, #0x1\n" ".inst 0x4e9d9636 // sdot v22.4s, v17.16b, v29.16b\n" ".inst 0x4e9e9633 // sdot v19.4s, v17.16b, v30.16b\n" "and v16.16b, v25.16b, v23.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v22.4s, v22.4s, v24.4s\n" "sqrdmulh v20.4s, v20.4s, v24.4s\n" "sqrdmulh v19.4s, v19.4s, v24.4s\n" "ldr q24, [%x[params], #0xc0]\n" "sqadd v25.4s, v25.4s, v16.4s\n" "and v18.16b, v22.16b, v23.16b\n" "and v17.16b, v20.16b, v23.16b\n" "and v16.16b, v19.16b, v23.16b\n" "sshr v18.4s, v18.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "srshl v25.4s, v25.4s, v23.4s\n" "sqadd v22.4s, v22.4s, v18.4s\n" "ldr q18, [%x[params], #0xa0]\n" "sqadd v20.4s, v20.4s, v17.4s\n" "ldr q17, [%x[params], #0xb0]\n" "sqadd v19.4s, v19.4s, v16.4s\n" "ldr q16, [%x[params], #0x90]\n" "add v25.4s, v25.4s, v15.4s\n" "srshl v22.4s, v22.4s, v23.4s\n" "srshl v20.4s, v20.4s, v23.4s\n" "srshl v19.4s, v19.4s, v23.4s\n" "ldr q23, [%x[params], #0xd0]\n" "smax v25.4s, v25.4s, v8.4s\n" "add v22.4s, v22.4s, v15.4s\n" "add v20.4s, v20.4s, v15.4s\n" "add v19.4s, v19.4s, v15.4s\n" "smin v25.4s, v25.4s, v12.4s\n" "smax v22.4s, v22.4s, v8.4s\n" "smax v20.4s, v20.4s, v8.4s\n" "smax v19.4s, v19.4s, v8.4s\n" "smin v22.4s, v22.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "smin v19.4s, v19.4s, v12.4s\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "uzp1 v25.16b, v25.16b, v25.16b\n" "str s25, [x25, x11]\n" "ldr q10, [%x[params], #0x80]\n" "uzp1 v22.16b, v22.16b, v22.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v19.16b, v19.16b, v19.16b\n" "uzp1 v22.16b, v22.16b, v22.16b\n" "str s22, [x24, x11]\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v19.16b, v19.16b, v19.16b\n" "str s20, [x23, x11]\n" "str s19, [x22, x11]\n" "mov v28.16b, v10.16b\n" "mov v20.16b, v10.16b\n" ".inst 0x4e959614 // sdot v20.4s, v16.16b, v21.16b\n" "mov v19.16b, v10.16b\n" ".inst 0x4e83960a // sdot v10.4s, v16.16b, v3.16b\n" ".inst 0x4e95964a // sdot v10.4s, v18.16b, v21.16b\n" "add x11, x11, #0x4\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x4e83961c // sdot v28.4s, v16.16b, v3.16b\n" ".inst 0x4e959613 // sdot v19.4s, v16.16b, v21.16b\n" ".inst 0x4e809654 // sdot v20.4s, v18.16b, v0.16b\n" ".inst 0x4e80962a // sdot v10.4s, v17.16b, v0.16b\n" "ext v0.16b, v0.16b, v0.16b, #0x1\n" ".inst 0x4e95965c // sdot v28.4s, v18.16b, v21.16b\n" ".inst 0x4e809653 // sdot v19.4s, v18.16b, v0.16b\n" ".inst 0x4e829634 // sdot v20.4s, v17.16b, v2.16b\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "sqrdmulh v10.4s, v10.4s, v24.4s\n" ".inst 0x4e80963c // sdot v28.4s, v17.16b, v0.16b\n" ".inst 0x4e829633 // sdot v19.4s, v17.16b, v2.16b\n" "and v16.16b, v10.16b, v23.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v28.4s, v28.4s, v24.4s\n" "sqrdmulh v20.4s, v20.4s, v24.4s\n" "sqrdmulh v19.4s, v19.4s, v24.4s\n" "ldr q24, [%x[params], #0x120]\n" "sqadd v10.4s, v10.4s, v16.4s\n" "and v18.16b, v28.16b, v23.16b\n" "and v17.16b, v20.16b, v23.16b\n" "and v16.16b, v19.16b, v23.16b\n" "sshr v18.4s, v18.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "srshl v10.4s, v10.4s, v23.4s\n" "sqadd v28.4s, v28.4s, v18.4s\n" "ldr q18, [%x[params], #0x100]\n" "sqadd v20.4s, v20.4s, v17.4s\n" "ldr q17, [%x[params], #0x110]\n" "sqadd v19.4s, v19.4s, v16.4s\n" "ldr q16, [%x[params], #0xf0]\n" "add v10.4s, v10.4s, v15.4s\n" "srshl v28.4s, v28.4s, v23.4s\n" "srshl v20.4s, v20.4s, v23.4s\n" "srshl v19.4s, v19.4s, v23.4s\n" "ldr q23, [%x[params], #0x130]\n" "smax v10.4s, v10.4s, v8.4s\n" "add v28.4s, v28.4s, v15.4s\n" "add v20.4s, v20.4s, v15.4s\n" "add v19.4s, v19.4s, v15.4s\n" "smin v10.4s, v10.4s, v12.4s\n" "smax v28.4s, v28.4s, v8.4s\n" "smax v20.4s, v20.4s, v8.4s\n" "smax v19.4s, v19.4s, v8.4s\n" "smin v28.4s, v28.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "smin v19.4s, v19.4s, v12.4s\n" "uzp1 v10.16b, v10.16b, v10.16b\n" "uzp1 v10.16b, v10.16b, v10.16b\n" "uzp1 v28.16b, v28.16b, v28.16b\n" "str s10, [x25, x11]\n" "ldr q22, [%x[params], #0xe0]\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v19.16b, v19.16b, v19.16b\n" "add %x[params], %x[params], #0x140\n" "uzp1 v28.16b, v28.16b, v28.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "str s28, [x24, x11]\n" "uzp1 v19.16b, v19.16b, v19.16b\n" "str s20, [x23, x11]\n" "mov v21.16b, v22.16b\n" "str s19, [x22, x11]\n" "mov v20.16b, v22.16b\n" "mov v19.16b, v22.16b\n" ".inst 0x4e8e9616 // sdot v22.4s, v16.16b, v14.16b\n" ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n" ".inst 0x4e9b9656 // sdot v22.4s, v18.16b, v27.16b\n" "ext v14.16b, v14.16b, v14.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x4e8e9615 // sdot v21.4s, v16.16b, v14.16b\n" ".inst 0x4e9b9613 // sdot v19.4s, v16.16b, v27.16b\n" ".inst 0x4e879654 // sdot v20.4s, v18.16b, v7.16b\n" ".inst 0x4e879636 // sdot v22.4s, v17.16b, v7.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n" ".inst 0x4e879653 // sdot v19.4s, v18.16b, v7.16b\n" "sqrdmulh v22.4s, v22.4s, v24.4s\n" ".inst 0x4e819634 // sdot v20.4s, v17.16b, v1.16b\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n" ".inst 0x4e819633 // sdot v19.4s, v17.16b, v1.16b\n" "and v16.16b, v22.16b, v23.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v21.4s, v21.4s, v24.4s\n" "sqrdmulh v20.4s, v20.4s, v24.4s\n" "sqrdmulh v19.4s, v19.4s, v24.4s\n" "sqadd v22.4s, v22.4s, v16.4s\n" "and v18.16b, v21.16b, v23.16b\n" "and v17.16b, v20.16b, v23.16b\n" "and v16.16b, v19.16b, v23.16b\n" "sshr v18.4s, v18.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqadd v21.4s, v21.4s, v18.4s\n" "sqadd v20.4s, v20.4s, v17.4s\n" "sqadd v19.4s, v19.4s, v16.4s\n" "srshl v22.4s, v22.4s, v23.4s\n" "srshl v21.4s, v21.4s, v23.4s\n" "srshl v20.4s, v20.4s, v23.4s\n" "srshl v19.4s, v19.4s, v23.4s\n" "add v22.4s, v22.4s, v15.4s\n" "add v21.4s, v21.4s, v15.4s\n" "add v20.4s, v20.4s, v15.4s\n" "add v19.4s, v19.4s, v15.4s\n" "smax v22.4s, v22.4s, v8.4s\n" "smax v21.4s, v21.4s, v8.4s\n" "smax v20.4s, v20.4s, v8.4s\n" "smax v19.4s, v19.4s, v8.4s\n" "smin v22.4s, v22.4s, v12.4s\n" "smin v21.4s, v21.4s, v12.4s\n" "smin v20.4s, v20.4s, v12.4s\n" "smin v19.4s, v19.4s, v12.4s\n" "uzp1 v22.16b, v22.16b, v22.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v19.16b, v19.16b, v19.16b\n" "uzp1 v22.16b, v22.16b, v22.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "str s22, [x25, x11]\n" "uzp1 v20.16b, v20.16b, v20.16b\n" "uzp1 v19.16b, v19.16b, v19.16b\n" "str s21, [x24, x11]\n" "str s20, [x23, x11]\n" "str s19, [x22, x11]\n" "add x11, x11, #0x4\n" "beq 35f\n" "3:" // Oddments "and x20, %x[n_channels], #0xf\n" "add x14, x14, x12\n" "add x13, x13, x12\n" "add x10, x10, x12\n" "add x9, x9, x12\n" "add x28, x28, x12\n" "add x27, x27, x12\n" "add x26, x26, x12\n" "add x21, x21, x12\n" "tbz %x[n_channels], #3, 7f\n" "ldr d11, [x14], #0x8\n" "ldr d10, [x13], #0x8\n" "ldr d3, [x10], #0x8\n" "ldr d14, [x9], #0x8\n" "ldr d13, [x28], #0x8\n" "ldr d28, [x27], #0x8\n" "ldr d21, [x26], #0x8\n" "ldr d27, [x21], #0x8\n" "tbz %x[n_channels], #2, 5f\n" "ld1 { v11.s }[2], [x14], #0x4\n" "ld1 { v10.s }[2], [x13], #0x4\n" "ld1 { v3.s }[2], [x10], #0x4\n" "ld1 { v14.s }[2], [x9], #0x4\n" "ld1 { v13.s }[2], [x28], #0x4\n" "ld1 { v28.s }[2], [x27], #0x4\n" "ld1 { v21.s }[2], [x26], #0x4\n" "ld1 { v27.s }[2], [x21], #0x4\n" "tbz %x[n_channels], #1, 4f\n" "ld1 { v11.h }[6], [x14], #0x2\n" "ld1 { v10.h }[6], [x13], #0x2\n" "ld1 { v3.h }[6], [x10], #0x2\n" "ld1 { v14.h }[6], [x9], #0x2\n" "ld1 { v13.h }[6], [x28], #0x2\n" "ld1 { v28.h }[6], [x27], #0x2\n" "ld1 { v21.h }[6], [x26], #0x2\n" "ld1 { v27.h }[6], [x21], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v11.b }[14], [x14], #0x1\n" "ld1 { v10.b }[14], [x13], #0x1\n" "ld1 { v3.b }[14], [x10], #0x1\n" "ld1 { v14.b }[14], [x9], #0x1\n" "ld1 { v13.b }[14], [x28], #0x1\n" "ld1 { v28.b }[14], [x27], #0x1\n" "ld1 { v21.b }[14], [x26], #0x1\n" "ld1 { v27.b }[14], [x21], #0x1\n" "b 11f\n" "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v11.b }[12], [x14], #0x1\n" "ld1 { v10.b }[12], [x13], #0x1\n" "ld1 { v3.b }[12], [x10], #0x1\n" "ld1 { v14.b }[12], [x9], #0x1\n" "ld1 { v13.b }[12], [x28], #0x1\n" "ld1 { v28.b }[12], [x27], #0x1\n" "ld1 { v21.b }[12], [x26], #0x1\n" "ld1 { v27.b }[12], [x21], #0x1\n" "b 11f\n" "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 6f\n" "ld1 { v11.h }[4], [x14], #0x2\n" "ld1 { v10.h }[4], [x13], #0x2\n" "ld1 { v3.h }[4], [x10], #0x2\n" "ld1 { v14.h }[4], [x9], #0x2\n" "ld1 { v13.h }[4], [x28], #0x2\n" "ld1 { v28.h }[4], [x27], #0x2\n" "ld1 { v21.h }[4], [x26], #0x2\n" "ld1 { v27.h }[4], [x21], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v11.b }[10], [x14], #0x1\n" "ld1 { v10.b }[10], [x13], #0x1\n" "ld1 { v3.b }[10], [x10], #0x1\n" "ld1 { v14.b }[10], [x9], #0x1\n" "ld1 { v13.b }[10], [x28], #0x1\n" "ld1 { v28.b }[10], [x27], #0x1\n" "ld1 { v21.b }[10], [x26], #0x1\n" "ld1 { v27.b }[10], [x21], #0x1\n" "b 11f\n" "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v11.b }[8], [x14], #0x1\n" "ld1 { v10.b }[8], [x13], #0x1\n" "ld1 { v3.b }[8], [x10], #0x1\n" "ld1 { v14.b }[8], [x9], #0x1\n" "ld1 { v13.b }[8], [x28], #0x1\n" "ld1 { v28.b }[8], [x27], #0x1\n" "ld1 { v21.b }[8], [x26], #0x1\n" "ld1 { v27.b }[8], [x21], #0x1\n" "b 11f\n" "7:" // Oddments: Load (A): Bit 3: Unset "tbz %x[n_channels], #2, 9f\n" "ldr s11, [x14], #0x4\n" "ldr s10, [x13], #0x4\n" "ldr s3, [x10], #0x4\n" "ldr s14, [x9], #0x4\n" "ldr s13, [x28], #0x4\n" "ldr s28, [x27], #0x4\n" "ldr s21, [x26], #0x4\n" "ldr s27, [x21], #0x4\n" "tbz %x[n_channels], #1, 8f\n" "ld1 { v11.h }[2], [x14], #0x2\n" "ld1 { v10.h }[2], [x13], #0x2\n" "ld1 { v3.h }[2], [x10], #0x2\n" "ld1 { v14.h }[2], [x9], #0x2\n" "ld1 { v13.h }[2], [x28], #0x2\n" "ld1 { v28.h }[2], [x27], #0x2\n" "ld1 { v21.h }[2], [x26], #0x2\n" "ld1 { v27.h }[2], [x21], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v11.b }[6], [x14], #0x1\n" "ld1 { v10.b }[6], [x13], #0x1\n" "ld1 { v3.b }[6], [x10], #0x1\n" "ld1 { v14.b }[6], [x9], #0x1\n" "ld1 { v13.b }[6], [x28], #0x1\n" "ld1 { v28.b }[6], [x27], #0x1\n" "ld1 { v21.b }[6], [x26], #0x1\n" "ld1 { v27.b }[6], [x21], #0x1\n" "b 11f\n" "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v11.b }[4], [x14], #0x1\n" "ld1 { v10.b }[4], [x13], #0x1\n" "ld1 { v3.b }[4], [x10], #0x1\n" "ld1 { v14.b }[4], [x9], #0x1\n" "ld1 { v13.b }[4], [x28], #0x1\n" "ld1 { v28.b }[4], [x27], #0x1\n" "ld1 { v21.b }[4], [x26], #0x1\n" "ld1 { v27.b }[4], [x21], #0x1\n" "b 11f\n" "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 10f\n" "ldr h11, [x14], #0x2\n" "ldr h10, [x13], #0x2\n" "ldr h3, [x10], #0x2\n" "ldr h14, [x9], #0x2\n" "ldr h13, [x28], #0x2\n" "ldr h28, [x27], #0x2\n" "ldr h21, [x26], #0x2\n" "ldr h27, [x21], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v11.b }[2], [x14], #0x1\n" "ld1 { v10.b }[2], [x13], #0x1\n" "ld1 { v3.b }[2], [x10], #0x1\n" "ld1 { v14.b }[2], [x9], #0x1\n" "ld1 { v13.b }[2], [x28], #0x1\n" "ld1 { v28.b }[2], [x27], #0x1\n" "ld1 { v21.b }[2], [x26], #0x1\n" "ld1 { v27.b }[2], [x21], #0x1\n" "b 11f\n" "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b11, [x14], #0x1\n" "ldr b10, [x13], #0x1\n" "ldr b3, [x10], #0x1\n" "ldr b14, [x9], #0x1\n" "ldr b13, [x28], #0x1\n" "ldr b28, [x27], #0x1\n" "ldr b21, [x26], #0x1\n" "ldr b27, [x21], #0x1\n" "11:" // Oddments: Load (A): Bit 3: End "ldp x14, x13, [%x[inptrs], #0x40]\n" "ldp x10, x9, [%x[inptrs], #0x50]\n" "add x14, x14, x12\n" "add x13, x13, x12\n" "ldp x28, x27, [%x[inptrs], #0x60]\n" "ldp x26, x21, [%x[inptrs], #0x70]\n" "add x10, x10, x12\n" "add x9, x9, x12\n" "add x28, x28, x12\n" "add x27, x27, x12\n" "add x26, x26, x12\n" "add x21, x21, x12\n" "tbz %x[n_channels], #3, 15f\n" "ldr d5, [x14], #0x8\n" "ldr d29, [x13], #0x8\n" "ldr d0, [x10], #0x8\n" "ldr d7, [x9], #0x8\n" "ldr d16, [x28], #0x8\n" "ldr d30, [x27], #0x8\n" "ldr d2, [x26], #0x8\n" "ldr d1, [x21], #0x8\n" "tbz %x[n_channels], #2, 13f\n" "ld1 { v5.s }[2], [x14], #0x4\n" "ld1 { v29.s }[2], [x13], #0x4\n" "ld1 { v0.s }[2], [x10], #0x4\n" "ld1 { v7.s }[2], [x9], #0x4\n" "ld1 { v16.s }[2], [x28], #0x4\n" "ld1 { v30.s }[2], [x27], #0x4\n" "ld1 { v2.s }[2], [x26], #0x4\n" "ld1 { v1.s }[2], [x21], #0x4\n" "tbz %x[n_channels], #1, 12f\n" "ld1 { v5.h }[6], [x14], #0x2\n" "ld1 { v29.h }[6], [x13], #0x2\n" "ld1 { v0.h }[6], [x10], #0x2\n" "ld1 { v7.h }[6], [x9], #0x2\n" "ld1 { v16.h }[6], [x28], #0x2\n" "ld1 { v30.h }[6], [x27], #0x2\n" "ld1 { v2.h }[6], [x26], #0x2\n" "ld1 { v1.h }[6], [x21], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v5.b }[14], [x14], #0x1\n" "ld1 { v29.b }[14], [x13], #0x1\n" "ld1 { v0.b }[14], [x10], #0x1\n" "ld1 { v7.b }[14], [x9], #0x1\n" "ld1 { v16.b }[14], [x28], #0x1\n" "ld1 { v30.b }[14], [x27], #0x1\n" "ld1 { v2.b }[14], [x26], #0x1\n" "ld1 { v1.b }[14], [x21], #0x1\n" "b 19f\n" "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v5.b }[12], [x14], #0x1\n" "ld1 { v29.b }[12], [x13], #0x1\n" "ld1 { v0.b }[12], [x10], #0x1\n" "ld1 { v7.b }[12], [x9], #0x1\n" "ld1 { v16.b }[12], [x28], #0x1\n" "ld1 { v30.b }[12], [x27], #0x1\n" "ld1 { v2.b }[12], [x26], #0x1\n" "ld1 { v1.b }[12], [x21], #0x1\n" "b 19f\n" "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 14f\n" "ld1 { v5.h }[4], [x14], #0x2\n" "ld1 { v29.h }[4], [x13], #0x2\n" "ld1 { v0.h }[4], [x10], #0x2\n" "ld1 { v7.h }[4], [x9], #0x2\n" "ld1 { v16.h }[4], [x28], #0x2\n" "ld1 { v30.h }[4], [x27], #0x2\n" "ld1 { v2.h }[4], [x26], #0x2\n" "ld1 { v1.h }[4], [x21], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v5.b }[10], [x14], #0x1\n" "ld1 { v29.b }[10], [x13], #0x1\n" "ld1 { v0.b }[10], [x10], #0x1\n" "ld1 { v7.b }[10], [x9], #0x1\n" "ld1 { v16.b }[10], [x28], #0x1\n" "ld1 { v30.b }[10], [x27], #0x1\n" "ld1 { v2.b }[10], [x26], #0x1\n" "ld1 { v1.b }[10], [x21], #0x1\n" "b 19f\n" "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v5.b }[8], [x14], #0x1\n" "ld1 { v29.b }[8], [x13], #0x1\n" "ld1 { v0.b }[8], [x10], #0x1\n" "ld1 { v7.b }[8], [x9], #0x1\n" "ld1 { v16.b }[8], [x28], #0x1\n" "ld1 { v30.b }[8], [x27], #0x1\n" "ld1 { v2.b }[8], [x26], #0x1\n" "ld1 { v1.b }[8], [x21], #0x1\n" "b 19f\n" "15:" // Oddments: Load (B): Bit 3: Unset "tbz %x[n_channels], #2, 17f\n" "ldr s5, [x14], #0x4\n" "ldr s29, [x13], #0x4\n" "ldr s0, [x10], #0x4\n" "ldr s7, [x9], #0x4\n" "ldr s16, [x28], #0x4\n" "ldr s30, [x27], #0x4\n" "ldr s2, [x26], #0x4\n" "ldr s1, [x21], #0x4\n" "tbz %x[n_channels], #1, 16f\n" "ld1 { v5.h }[2], [x14], #0x2\n" "ld1 { v29.h }[2], [x13], #0x2\n" "ld1 { v0.h }[2], [x10], #0x2\n" "ld1 { v7.h }[2], [x9], #0x2\n" "ld1 { v16.h }[2], [x28], #0x2\n" "ld1 { v30.h }[2], [x27], #0x2\n" "ld1 { v2.h }[2], [x26], #0x2\n" "ld1 { v1.h }[2], [x21], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v5.b }[6], [x14], #0x1\n" "ld1 { v29.b }[6], [x13], #0x1\n" "ld1 { v0.b }[6], [x10], #0x1\n" "ld1 { v7.b }[6], [x9], #0x1\n" "ld1 { v16.b }[6], [x28], #0x1\n" "ld1 { v30.b }[6], [x27], #0x1\n" "ld1 { v2.b }[6], [x26], #0x1\n" "ld1 { v1.b }[6], [x21], #0x1\n" "b 19f\n" "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v5.b }[4], [x14], #0x1\n" "ld1 { v29.b }[4], [x13], #0x1\n" "ld1 { v0.b }[4], [x10], #0x1\n" "ld1 { v7.b }[4], [x9], #0x1\n" "ld1 { v16.b }[4], [x28], #0x1\n" "ld1 { v30.b }[4], [x27], #0x1\n" "ld1 { v2.b }[4], [x26], #0x1\n" "ld1 { v1.b }[4], [x21], #0x1\n" "b 19f\n" "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 18f\n" "ldr h5, [x14], #0x2\n" "ldr h29, [x13], #0x2\n" "ldr h0, [x10], #0x2\n" "ldr h7, [x9], #0x2\n" "ldr h16, [x28], #0x2\n" "ldr h30, [x27], #0x2\n" "ldr h2, [x26], #0x2\n" "ldr h1, [x21], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v5.b }[2], [x14], #0x1\n" "ld1 { v29.b }[2], [x13], #0x1\n" "ld1 { v0.b }[2], [x10], #0x1\n" "ld1 { v7.b }[2], [x9], #0x1\n" "ld1 { v16.b }[2], [x28], #0x1\n" "ld1 { v30.b }[2], [x27], #0x1\n" "ld1 { v2.b }[2], [x26], #0x1\n" "ld1 { v1.b }[2], [x21], #0x1\n" "b 19f\n" "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b5, [x14], #0x1\n" "ldr b29, [x13], #0x1\n" "ldr b0, [x10], #0x1\n" "ldr b7, [x9], #0x1\n" "ldr b16, [x28], #0x1\n" "ldr b30, [x27], #0x1\n" "ldr b2, [x26], #0x1\n" "ldr b1, [x21], #0x1\n" "19:" // Oddments: Load (B): Bit 3: End "ldr q25, [%x[params], #0x10]\n" "ldr q24, [%x[params], #0x20]\n" "zip2 v18.16b, v11.16b, v3.16b\n" "zip1 v11.16b, v11.16b, v3.16b\n" "ldr q23, [%x[params], #0x30]\n" "zip1 v17.16b, v10.16b, v14.16b\n" "zip2 v14.16b, v10.16b, v14.16b\n" "cmp x20, #0x4\n" "zip2 v10.16b, v11.16b, v17.16b\n" "zip1 v11.16b, v11.16b, v17.16b\n" "zip1 v3.16b, v18.16b, v14.16b\n" "zip2 v14.16b, v18.16b, v14.16b\n" "ldr q31, [%x[params], #0x0]\n" "zip2 v22.16b, v13.16b, v21.16b\n" "zip1 v13.16b, v13.16b, v21.16b\n" "zip1 v21.16b, v28.16b, v27.16b\n" "zip2 v27.16b, v28.16b, v27.16b\n" "zip2 v20.16b, v5.16b, v0.16b\n" "zip1 v5.16b, v5.16b, v0.16b\n" "zip1 v19.16b, v29.16b, v7.16b\n" "zip2 v7.16b, v29.16b, v7.16b\n" "zip2 v18.16b, v16.16b, v2.16b\n" "zip1 v16.16b, v16.16b, v2.16b\n" "zip1 v17.16b, v30.16b, v1.16b\n" "zip2 v1.16b, v30.16b, v1.16b\n" "zip2 v28.16b, v13.16b, v21.16b\n" "zip1 v13.16b, v13.16b, v21.16b\n" "zip1 v21.16b, v22.16b, v27.16b\n" "zip2 v27.16b, v22.16b, v27.16b\n" "zip2 v29.16b, v5.16b, v19.16b\n" "zip1 v5.16b, v5.16b, v19.16b\n" "zip1 v0.16b, v20.16b, v7.16b\n" "zip2 v7.16b, v20.16b, v7.16b\n" "zip2 v30.16b, v16.16b, v17.16b\n" "zip1 v16.16b, v16.16b, v17.16b\n" "zip1 v2.16b, v18.16b, v1.16b\n" "zip2 v1.16b, v18.16b, v1.16b\n" "mov v26.16b, v31.16b\n" "mov v18.16b, v31.16b\n" ".inst 0x4e8d9732 // sdot v18.4s, v25.16b, v13.16b\n" "mov v4.16b, v31.16b\n" ".inst 0x4e8b973f // sdot v31.4s, v25.16b, v11.16b\n" ".inst 0x4e8d971f // sdot v31.4s, v24.16b, v13.16b\n" "ext v11.16b, v11.16b, v11.16b, #0x1\n" "ext v13.16b, v13.16b, v13.16b, #0x1\n" ".inst 0x4e8b973a // sdot v26.4s, v25.16b, v11.16b\n" "ldr q17, [%x[params], #0x40]\n" ".inst 0x4e8d9724 // sdot v4.4s, v25.16b, v13.16b\n" ".inst 0x4e859712 // sdot v18.4s, v24.16b, v5.16b\n" ".inst 0x4e8596ff // sdot v31.4s, v23.16b, v5.16b\n" "ext v5.16b, v5.16b, v5.16b, #0x1\n" ".inst 0x4e8d971a // sdot v26.4s, v24.16b, v13.16b\n" "ldr q20, [%x[params], #0x50]\n" ".inst 0x4e859704 // sdot v4.4s, v24.16b, v5.16b\n" ".inst 0x4e9096f2 // sdot v18.4s, v23.16b, v16.16b\n" "ext v16.16b, v16.16b, v16.16b, #0x1\n" "add %x[params], %x[params], #0x60\n" "sqrdmulh v31.4s, v31.4s, v17.4s\n" ".inst 0x4e8596fa // sdot v26.4s, v23.16b, v5.16b\n" ".inst 0x4e9096e4 // sdot v4.4s, v23.16b, v16.16b\n" "and v16.16b, v31.16b, v20.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v17.4s\n" "sqrdmulh v18.4s, v18.4s, v17.4s\n" "sqrdmulh v4.4s, v4.4s, v17.4s\n" "sqadd v31.4s, v31.4s, v16.4s\n" "and v19.16b, v26.16b, v20.16b\n" "and v17.16b, v18.16b, v20.16b\n" "and v16.16b, v4.16b, v20.16b\n" "sshr v19.4s, v19.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v19.4s\n" "sqadd v18.4s, v18.4s, v17.4s\n" "sqadd v4.4s, v4.4s, v16.4s\n" "srshl v31.4s, v31.4s, v20.4s\n" "srshl v26.4s, v26.4s, v20.4s\n" "srshl v18.4s, v18.4s, v20.4s\n" "srshl v4.4s, v4.4s, v20.4s\n" "add v31.4s, v31.4s, v15.4s\n" "add v26.4s, v26.4s, v15.4s\n" "add v18.4s, v18.4s, v15.4s\n" "add v4.4s, v4.4s, v15.4s\n" "smax v31.4s, v31.4s, v8.4s\n" "smax v26.4s, v26.4s, v8.4s\n" "smax v18.4s, v18.4s, v8.4s\n" "smax v4.4s, v4.4s, v8.4s\n" "smin v31.4s, v31.4s, v12.4s\n" "smin v26.4s, v26.4s, v12.4s\n" "smin v18.4s, v18.4s, v12.4s\n" "smin v4.4s, v4.4s, v12.4s\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "blt 20f\n" "str s31, [x25, x11]\n" "str s26, [x24, x11]\n" "str s18, [x23, x11]\n" "str s4, [x22, x11]\n" "b 23f\n" "20:" // Oddments: Unroll 0: Oddment store "add x25, x25, x11\n" "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "tbz x20, #1, 21f\n" "st1 { v31.h }[0], [x25], #0x2\n" "st1 { v26.h }[0], [x24], #0x2\n" "st1 { v18.h }[0], [x23], #0x2\n" "st1 { v4.h }[0], [x22], #0x2\n" "tbz x20, #0, 22f\n" "st1 { v31.b }[2], [x25], #0x1\n" "st1 { v26.b }[2], [x24], #0x1\n" "st1 { v18.b }[2], [x23], #0x1\n" "st1 { v4.b }[2], [x22], #0x1\n" "b 22f\n" "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset "st1 { v31.b }[0], [x25], #0x1\n" "st1 { v26.b }[0], [x24], #0x1\n" "st1 { v18.b }[0], [x23], #0x1\n" "st1 { v4.b }[0], [x22], #0x1\n" "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End "23:" // Oddments: Unroll 0: After oddment store "subs x20, x20, #0x4\n" "add x11, x11, #0x4\n" "ble 35f\n" "ldr q31, [%x[params], #0x0]\n" "ldr q23, [%x[params], #0x10]\n" "mov v26.16b, v31.16b\n" "mov v18.16b, v31.16b\n" "ldr q22, [%x[params], #0x20]\n" "ldr q16, [%x[params], #0x30]\n" "mov v4.16b, v31.16b\n" ".inst 0x4e8a96ff // sdot v31.4s, v23.16b, v10.16b\n" "ldr q17, [%x[params], #0x40]\n" "ldr q20, [%x[params], #0x50]\n" ".inst 0x4e9c96f2 // sdot v18.4s, v23.16b, v28.16b\n" ".inst 0x4e9c96df // sdot v31.4s, v22.16b, v28.16b\n" "ext v10.16b, v10.16b, v10.16b, #0x1\n" "ext v28.16b, v28.16b, v28.16b, #0x1\n" ".inst 0x4e8a96fa // sdot v26.4s, v23.16b, v10.16b\n" "cmp x20, #0x4\n" ".inst 0x4e9c96e4 // sdot v4.4s, v23.16b, v28.16b\n" ".inst 0x4e9d96d2 // sdot v18.4s, v22.16b, v29.16b\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e9d961f // sdot v31.4s, v16.16b, v29.16b\n" "ext v29.16b, v29.16b, v29.16b, #0x1\n" ".inst 0x4e9c96da // sdot v26.4s, v22.16b, v28.16b\n" ".inst 0x4e9d96c4 // sdot v4.4s, v22.16b, v29.16b\n" ".inst 0x4e9e9612 // sdot v18.4s, v16.16b, v30.16b\n" "ext v30.16b, v30.16b, v30.16b, #0x1\n" "sqrdmulh v31.4s, v31.4s, v17.4s\n" ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n" ".inst 0x4e9e9604 // sdot v4.4s, v16.16b, v30.16b\n" "and v16.16b, v31.16b, v20.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v17.4s\n" "sqrdmulh v18.4s, v18.4s, v17.4s\n" "sqrdmulh v4.4s, v4.4s, v17.4s\n" "sqadd v31.4s, v31.4s, v16.4s\n" "and v19.16b, v26.16b, v20.16b\n" "and v17.16b, v18.16b, v20.16b\n" "and v16.16b, v4.16b, v20.16b\n" "sshr v19.4s, v19.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v19.4s\n" "sqadd v18.4s, v18.4s, v17.4s\n" "sqadd v4.4s, v4.4s, v16.4s\n" "srshl v31.4s, v31.4s, v20.4s\n" "srshl v26.4s, v26.4s, v20.4s\n" "srshl v18.4s, v18.4s, v20.4s\n" "srshl v4.4s, v4.4s, v20.4s\n" "add v31.4s, v31.4s, v15.4s\n" "add v26.4s, v26.4s, v15.4s\n" "add v18.4s, v18.4s, v15.4s\n" "add v4.4s, v4.4s, v15.4s\n" "smax v31.4s, v31.4s, v8.4s\n" "smax v26.4s, v26.4s, v8.4s\n" "smax v18.4s, v18.4s, v8.4s\n" "smax v4.4s, v4.4s, v8.4s\n" "smin v31.4s, v31.4s, v12.4s\n" "smin v26.4s, v26.4s, v12.4s\n" "smin v18.4s, v18.4s, v12.4s\n" "smin v4.4s, v4.4s, v12.4s\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "blt 24f\n" "str s31, [x25, x11]\n" "str s26, [x24, x11]\n" "str s18, [x23, x11]\n" "str s4, [x22, x11]\n" "b 27f\n" "24:" // Oddments: Unroll 1: Oddment store "add x25, x25, x11\n" "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "tbz x20, #1, 25f\n" "st1 { v31.h }[0], [x25], #0x2\n" "st1 { v26.h }[0], [x24], #0x2\n" "st1 { v18.h }[0], [x23], #0x2\n" "st1 { v4.h }[0], [x22], #0x2\n" "tbz x20, #0, 26f\n" "st1 { v31.b }[2], [x25], #0x1\n" "st1 { v26.b }[2], [x24], #0x1\n" "st1 { v18.b }[2], [x23], #0x1\n" "st1 { v4.b }[2], [x22], #0x1\n" "b 26f\n" "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset "st1 { v31.b }[0], [x25], #0x1\n" "st1 { v26.b }[0], [x24], #0x1\n" "st1 { v18.b }[0], [x23], #0x1\n" "st1 { v4.b }[0], [x22], #0x1\n" "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End "27:" // Oddments: Unroll 1: After oddment store "subs x20, x20, #0x4\n" "add x11, x11, #0x4\n" "ble 35f\n" "ldr q31, [%x[params], #0x0]\n" "ldr q23, [%x[params], #0x10]\n" "mov v26.16b, v31.16b\n" "mov v18.16b, v31.16b\n" "ldr q22, [%x[params], #0x20]\n" "ldr q16, [%x[params], #0x30]\n" "mov v4.16b, v31.16b\n" ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n" "ldr q17, [%x[params], #0x40]\n" "ldr q20, [%x[params], #0x50]\n" ".inst 0x4e9596f2 // sdot v18.4s, v23.16b, v21.16b\n" ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "ext v21.16b, v21.16b, v21.16b, #0x1\n" ".inst 0x4e8396fa // sdot v26.4s, v23.16b, v3.16b\n" "cmp x20, #0x4\n" ".inst 0x4e9596e4 // sdot v4.4s, v23.16b, v21.16b\n" ".inst 0x4e8096d2 // sdot v18.4s, v22.16b, v0.16b\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e80961f // sdot v31.4s, v16.16b, v0.16b\n" "ext v0.16b, v0.16b, v0.16b, #0x1\n" ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n" ".inst 0x4e8096c4 // sdot v4.4s, v22.16b, v0.16b\n" ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n" "ext v2.16b, v2.16b, v2.16b, #0x1\n" "sqrdmulh v31.4s, v31.4s, v17.4s\n" ".inst 0x4e80961a // sdot v26.4s, v16.16b, v0.16b\n" ".inst 0x4e829604 // sdot v4.4s, v16.16b, v2.16b\n" "and v16.16b, v31.16b, v20.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v17.4s\n" "sqrdmulh v18.4s, v18.4s, v17.4s\n" "sqrdmulh v4.4s, v4.4s, v17.4s\n" "sqadd v31.4s, v31.4s, v16.4s\n" "and v19.16b, v26.16b, v20.16b\n" "and v17.16b, v18.16b, v20.16b\n" "and v16.16b, v4.16b, v20.16b\n" "sshr v19.4s, v19.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v19.4s\n" "sqadd v18.4s, v18.4s, v17.4s\n" "sqadd v4.4s, v4.4s, v16.4s\n" "srshl v31.4s, v31.4s, v20.4s\n" "srshl v26.4s, v26.4s, v20.4s\n" "srshl v18.4s, v18.4s, v20.4s\n" "srshl v4.4s, v4.4s, v20.4s\n" "add v31.4s, v31.4s, v15.4s\n" "add v26.4s, v26.4s, v15.4s\n" "add v18.4s, v18.4s, v15.4s\n" "add v4.4s, v4.4s, v15.4s\n" "smax v31.4s, v31.4s, v8.4s\n" "smax v26.4s, v26.4s, v8.4s\n" "smax v18.4s, v18.4s, v8.4s\n" "smax v4.4s, v4.4s, v8.4s\n" "smin v31.4s, v31.4s, v12.4s\n" "smin v26.4s, v26.4s, v12.4s\n" "smin v18.4s, v18.4s, v12.4s\n" "smin v4.4s, v4.4s, v12.4s\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "blt 28f\n" "str s31, [x25, x11]\n" "str s26, [x24, x11]\n" "str s18, [x23, x11]\n" "str s4, [x22, x11]\n" "b 31f\n" "28:" // Oddments: Unroll 2: Oddment store "add x25, x25, x11\n" "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "tbz x20, #1, 29f\n" "st1 { v31.h }[0], [x25], #0x2\n" "st1 { v26.h }[0], [x24], #0x2\n" "st1 { v18.h }[0], [x23], #0x2\n" "st1 { v4.h }[0], [x22], #0x2\n" "tbz x20, #0, 30f\n" "st1 { v31.b }[2], [x25], #0x1\n" "st1 { v26.b }[2], [x24], #0x1\n" "st1 { v18.b }[2], [x23], #0x1\n" "st1 { v4.b }[2], [x22], #0x1\n" "b 30f\n" "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset "st1 { v31.b }[0], [x25], #0x1\n" "st1 { v26.b }[0], [x24], #0x1\n" "st1 { v18.b }[0], [x23], #0x1\n" "st1 { v4.b }[0], [x22], #0x1\n" "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End "31:" // Oddments: Unroll 2: After oddment store "subs x20, x20, #0x4\n" "add x11, x11, #0x4\n" "ble 35f\n" "ldr q31, [%x[params], #0x0]\n" "ldr q20, [%x[params], #0x10]\n" "mov v26.16b, v31.16b\n" "mov v18.16b, v31.16b\n" "ldr q19, [%x[params], #0x20]\n" "ldr q16, [%x[params], #0x30]\n" "mov v4.16b, v31.16b\n" ".inst 0x4e8e969f // sdot v31.4s, v20.16b, v14.16b\n" "ldr q17, [%x[params], #0x40]\n" "ldr q22, [%x[params], #0x50]\n" ".inst 0x4e9b9692 // sdot v18.4s, v20.16b, v27.16b\n" ".inst 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n" "ext v14.16b, v14.16b, v14.16b, #0x1\n" "ext v27.16b, v27.16b, v27.16b, #0x1\n" ".inst 0x4e8e969a // sdot v26.4s, v20.16b, v14.16b\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e9b9684 // sdot v4.4s, v20.16b, v27.16b\n" ".inst 0x4e879672 // sdot v18.4s, v19.16b, v7.16b\n" ".inst 0x4e87961f // sdot v31.4s, v16.16b, v7.16b\n" "ext v7.16b, v7.16b, v7.16b, #0x1\n" ".inst 0x4e9b967a // sdot v26.4s, v19.16b, v27.16b\n" ".inst 0x4e879664 // sdot v4.4s, v19.16b, v7.16b\n" ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" "sqrdmulh v31.4s, v31.4s, v17.4s\n" ".inst 0x4e87961a // sdot v26.4s, v16.16b, v7.16b\n" ".inst 0x4e819604 // sdot v4.4s, v16.16b, v1.16b\n" "and v16.16b, v31.16b, v22.16b\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v17.4s\n" "sqrdmulh v18.4s, v18.4s, v17.4s\n" "sqrdmulh v4.4s, v4.4s, v17.4s\n" "sqadd v31.4s, v31.4s, v16.4s\n" "and v23.16b, v26.16b, v22.16b\n" "and v17.16b, v18.16b, v22.16b\n" "and v16.16b, v4.16b, v22.16b\n" "sshr v23.4s, v23.4s, #0x1f\n" "sshr v17.4s, v17.4s, #0x1f\n" "sshr v16.4s, v16.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v23.4s\n" "sqadd v18.4s, v18.4s, v17.4s\n" "sqadd v4.4s, v4.4s, v16.4s\n" "srshl v31.4s, v31.4s, v22.4s\n" "srshl v26.4s, v26.4s, v22.4s\n" "srshl v18.4s, v18.4s, v22.4s\n" "srshl v4.4s, v4.4s, v22.4s\n" "add v31.4s, v31.4s, v15.4s\n" "add v26.4s, v26.4s, v15.4s\n" "add v18.4s, v18.4s, v15.4s\n" "add v4.4s, v4.4s, v15.4s\n" "smax v31.4s, v31.4s, v8.4s\n" "smax v26.4s, v26.4s, v8.4s\n" "smax v18.4s, v18.4s, v8.4s\n" "smax v4.4s, v4.4s, v8.4s\n" "smin v31.4s, v31.4s, v12.4s\n" "smin v26.4s, v26.4s, v12.4s\n" "smin v18.4s, v18.4s, v12.4s\n" "smin v4.4s, v4.4s, v12.4s\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "uzp1 v31.16b, v31.16b, v31.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v18.16b, v18.16b, v18.16b\n" "uzp1 v4.16b, v4.16b, v4.16b\n" "32:" // Oddments: Unroll 3: Oddment store "add x25, x25, x11\n" "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "tbz x20, #1, 33f\n" "st1 { v31.h }[0], [x25], #0x2\n" "st1 { v26.h }[0], [x24], #0x2\n" "st1 { v18.h }[0], [x23], #0x2\n" "st1 { v4.h }[0], [x22], #0x2\n" "tbz x20, #0, 34f\n" "st1 { v31.b }[2], [x25], #0x1\n" "st1 { v26.b }[2], [x24], #0x1\n" "st1 { v18.b }[2], [x23], #0x1\n" "st1 { v4.b }[2], [x22], #0x1\n" "b 34f\n" "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset "st1 { v31.b }[0], [x25], #0x1\n" "st1 { v26.b }[0], [x24], #0x1\n" "st1 { v18.b }[0], [x23], #0x1\n" "st1 { v4.b }[0], [x22], #0x1\n" "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End "35:" // End : [params] "+&r" (params) : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } } // namespace depthwise } // namespace arm_conv #endif // defined(__aarch64__)