/* * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if defined(__aarch64__) #include "arm_gemm.hpp" #include namespace arm_conv { namespace depthwise { void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, // Bias, should be wrapped into the parameters const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, // Requant parameters, also wrapped int8_t *const *const outptrs ) { __asm__ __volatile__( "lsr x15, %x[n_channels], #0x4\n" "add x20, %x[qp], %[offsetof_Requantize32_minval]\n" "ld1r { v9.4s }, [x20]\n" "ldp x14, x13, [%x[inptrs], #0x0]\n" "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n" "ld1r { v8.4s }, [x20]\n" "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n" "ld1r { v7.4s }, [x20]\n" "mov x12, #0x0\n" "mov x11, #0x0\n" "ldp x10, x9, [%x[inptrs], #0x10]\n" "ldp x28, x27, [%x[inptrs], #0x20]\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "ldp x24, x23, [%x[outptrs], #0x0]\n" "ldp x22, x21, [%x[outptrs], #0x10]\n" "cbz x15, 3f\n" "ldr q6, [x14, x12]\n" "ldr q5, [x13, x12]\n" "subs x15, x15, #0x1\n" "ldr q4, [x10, x12]\n" "ldr q3, [x9, x12]\n" "zip2 v2.16b, v6.16b, v4.16b\n" "zip1 v6.16b, v6.16b, v4.16b\n" "ldr q1, [x28, x12]\n" "ldr q0, [x27, x12]\n" "zip1 v4.16b, v5.16b, v3.16b\n" "zip2 v3.16b, v5.16b, v3.16b\n" "ldr q31, [x26, x12]\n" "ldr q30, [x25, x12]\n" "zip2 v5.16b, v6.16b, v4.16b\n" "zip1 v6.16b, v6.16b, v4.16b\n" "ldr q29, [%x[params], #0x10]\n" "ldr q28, [%x[params], #0x20]\n" "zip1 v4.16b, v2.16b, v3.16b\n" "zip2 v3.16b, v2.16b, v3.16b\n" "ldr q2, [%x[params], #0x0]\n" "ldr q27, [%x[params], #0x30]\n" "zip2 v26.16b, v1.16b, v31.16b\n" "zip1 v1.16b, v1.16b, v31.16b\n" "ldp x14, x13, [%x[inptrs], #0x40]\n" "ldr q25, [x14, x12]\n" "zip1 v31.16b, v0.16b, v30.16b\n" "zip2 v30.16b, v0.16b, v30.16b\n" "ldr q24, [x13, x12]\n" "ldp x10, x9, [%x[inptrs], #0x50]\n" "zip2 v0.16b, v1.16b, v31.16b\n" "zip1 v1.16b, v1.16b, v31.16b\n" "ldr q23, [x10, x12]\n" "ldr q22, [x9, x12]\n" "zip2 v21.16b, v25.16b, v23.16b\n" "zip1 v25.16b, v25.16b, v23.16b\n" "ldp x28, x27, [%x[inptrs], #0x60]\n" "ldr q20, [x28, x12]\n" "zip1 v23.16b, v24.16b, v22.16b\n" "zip2 v22.16b, v24.16b, v22.16b\n" "ldr q19, [x27, x12]\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "zip1 v31.16b, v26.16b, v30.16b\n" "zip2 v30.16b, v26.16b, v30.16b\n" "ldr q18, [x26, x12]\n" "ldr q17, [x25, x12]\n" "zip2 v16.16b, v20.16b, v18.16b\n" "zip1 v20.16b, v20.16b, v18.16b\n" "zip1 v18.16b, v19.16b, v17.16b\n" "zip2 v17.16b, v19.16b, v17.16b\n" "ldp x14, x13, [%x[inptrs], #0x0]\n" "ldp x10, x9, [%x[inptrs], #0x10]\n" "ldp x28, x27, [%x[inptrs], #0x20]\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "zip2 v24.16b, v25.16b, v23.16b\n" "zip1 v25.16b, v25.16b, v23.16b\n" "zip1 v23.16b, v21.16b, v22.16b\n" "zip2 v22.16b, v21.16b, v22.16b\n" "add %x[params], %x[params], #0x40\n" "zip2 v19.16b, v20.16b, v18.16b\n" "zip1 v20.16b, v20.16b, v18.16b\n" "zip1 v18.16b, v16.16b, v17.16b\n" "zip2 v17.16b, v16.16b, v17.16b\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" "mov v16.16b, v2.16b\n" "beq 2f\n" "1:" // Loop ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n" ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" "add x12, x12, #0x10\n" ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n" "ldr q6, [%x[params], #0x0]\n" ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n" ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n" "subs x15, x15, #0x1\n" ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n" "ext v25.16b, v25.16b, v25.16b, #0x1\n" ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n" "ldr q1, [%x[params], #0x10]\n" ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n" ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n" "ext v20.16b, v20.16b, v20.16b, #0x1\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n" ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "ldr q6, [%x[params], #0x60]\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "srshl v2.4s, v2.4s, v1.4s\n" "sqadd v26.4s, v26.4s, v28.4s\n" "ldr q28, [%x[params], #0x40]\n" "sqadd v21.4s, v21.4s, v27.4s\n" "ldr q27, [%x[params], #0x50]\n" "sqadd v16.4s, v16.4s, v29.4s\n" "ldr q29, [%x[params], #0x30]\n" "add v2.4s, v2.4s, v7.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "ldr q1, [%x[params], #0x70]\n" "smax v2.4s, v2.4s, v9.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s2, [x24, x11]\n" "ldr q2, [%x[params], #0x20]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "str s26, [x23, x11]\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s21, [x22, x11]\n" "mov v26.16b, v2.16b\n" "str s16, [x21, x11]\n" "mov v21.16b, v2.16b\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n" ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n" ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n" "ext v5.16b, v5.16b, v5.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v0.16b, v0.16b, v0.16b, #0x1\n" ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n" "ldr q5, [x13, x12]\n" ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n" ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n" ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n" "ldr q0, [x27, x12]\n" ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n" "ext v19.16b, v19.16b, v19.16b, #0x1\n" ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n" ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "ldr q6, [%x[params], #0xc0]\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "srshl v2.4s, v2.4s, v1.4s\n" "sqadd v26.4s, v26.4s, v28.4s\n" "ldr q28, [%x[params], #0xa0]\n" "sqadd v21.4s, v21.4s, v27.4s\n" "ldr q27, [%x[params], #0xb0]\n" "sqadd v16.4s, v16.4s, v29.4s\n" "ldr q29, [%x[params], #0x90]\n" "add v2.4s, v2.4s, v7.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "ldr q1, [%x[params], #0xd0]\n" "smax v2.4s, v2.4s, v9.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "str s2, [x24, x11]\n" "ldr q2, [%x[params], #0x80]\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s26, [x23, x11]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s21, [x22, x11]\n" "str s16, [x21, x11]\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n" ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n" "add x11, x11, #0x4\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "ext v31.16b, v31.16b, v31.16b, #0x1\n" ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n" "ldr q4, [x10, x12]\n" ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n" ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n" ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n" "ldr q31, [x26, x12]\n" ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n" ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n" "ext v18.16b, v18.16b, v18.16b, #0x1\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n" ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "ldr q6, [%x[params], #0x120]\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "srshl v2.4s, v2.4s, v1.4s\n" "sqadd v26.4s, v26.4s, v28.4s\n" "ldr q28, [%x[params], #0x100]\n" "sqadd v21.4s, v21.4s, v27.4s\n" "ldr q27, [%x[params], #0x110]\n" "sqadd v16.4s, v16.4s, v29.4s\n" "ldr q29, [%x[params], #0xf0]\n" "add v2.4s, v2.4s, v7.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "ldr q1, [%x[params], #0x130]\n" "smax v2.4s, v2.4s, v9.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s2, [x24, x11]\n" "ldr q2, [%x[params], #0xe0]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "str s26, [x23, x11]\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s21, [x22, x11]\n" "mov v26.16b, v2.16b\n" "str s16, [x21, x11]\n" "mov v21.16b, v2.16b\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n" ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n" ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v30.16b, v30.16b, v30.16b, #0x1\n" ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n" "ldr q3, [x9, x12]\n" ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n" ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n" ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n" "ldr q30, [x25, x12]\n" ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n" "ext v17.16b, v17.16b, v17.16b, #0x1\n" ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n" ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "ldr q6, [x14, x12]\n" "ldp x14, x13, [%x[inptrs], #0x40]\n" "ldr q25, [x14, x12]\n" "ldr q24, [x13, x12]\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "ldp x10, x9, [%x[inptrs], #0x50]\n" "ldr q23, [x10, x12]\n" "ldr q22, [x9, x12]\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "srshl v2.4s, v2.4s, v1.4s\n" "sqadd v26.4s, v26.4s, v28.4s\n" "ldr q28, [%x[params], #0x160]\n" "sqadd v21.4s, v21.4s, v27.4s\n" "ldr q27, [%x[params], #0x170]\n" "sqadd v16.4s, v16.4s, v29.4s\n" "ldr q29, [%x[params], #0x150]\n" "add v2.4s, v2.4s, v7.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "ldr q1, [x28, x12]\n" "smax v2.4s, v2.4s, v9.4s\n" "ldp x28, x27, [%x[inptrs], #0x60]\n" "ldr q20, [x28, x12]\n" "ldr q19, [x27, x12]\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "ldr q18, [x26, x12]\n" "ldr q17, [x25, x12]\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "ldp x14, x13, [%x[inptrs], #0x0]\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "ldp x10, x9, [%x[inptrs], #0x10]\n" "ldp x28, x27, [%x[inptrs], #0x20]\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "ldp x26, x25, [%x[inptrs], #0x30]\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "str s2, [x24, x11]\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "zip2 v2.16b, v6.16b, v4.16b\n" "zip1 v6.16b, v6.16b, v4.16b\n" "zip1 v4.16b, v5.16b, v3.16b\n" "zip2 v3.16b, v5.16b, v3.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s26, [x23, x11]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s21, [x22, x11]\n" "str s16, [x21, x11]\n" "zip2 v5.16b, v6.16b, v4.16b\n" "zip1 v6.16b, v6.16b, v4.16b\n" "add x11, x11, #0x4\n" "zip1 v4.16b, v2.16b, v3.16b\n" "zip2 v3.16b, v2.16b, v3.16b\n" "ldr q2, [%x[params], #0x140]\n" "add %x[params], %x[params], #0x180\n" "zip2 v26.16b, v1.16b, v31.16b\n" "zip1 v1.16b, v1.16b, v31.16b\n" "zip1 v31.16b, v0.16b, v30.16b\n" "zip2 v30.16b, v0.16b, v30.16b\n" "zip2 v21.16b, v25.16b, v23.16b\n" "zip1 v25.16b, v25.16b, v23.16b\n" "zip1 v23.16b, v24.16b, v22.16b\n" "zip2 v22.16b, v24.16b, v22.16b\n" "zip2 v16.16b, v20.16b, v18.16b\n" "zip1 v20.16b, v20.16b, v18.16b\n" "zip1 v18.16b, v19.16b, v17.16b\n" "zip2 v17.16b, v19.16b, v17.16b\n" "zip2 v0.16b, v1.16b, v31.16b\n" "zip1 v1.16b, v1.16b, v31.16b\n" "zip1 v31.16b, v26.16b, v30.16b\n" "zip2 v30.16b, v26.16b, v30.16b\n" "zip2 v24.16b, v25.16b, v23.16b\n" "zip1 v25.16b, v25.16b, v23.16b\n" "zip1 v23.16b, v21.16b, v22.16b\n" "zip2 v22.16b, v21.16b, v22.16b\n" "zip2 v19.16b, v20.16b, v18.16b\n" "zip1 v20.16b, v20.16b, v18.16b\n" "zip1 v18.16b, v16.16b, v17.16b\n" "zip2 v17.16b, v16.16b, v17.16b\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" "mov v16.16b, v2.16b\n" "bgt 1b\n" "2:" // Detached iteration ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n" ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" "tst %x[n_channels], #0xf\n" ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n" "ldr q6, [%x[params], #0x0]\n" ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n" ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n" "add x12, x12, #0x10\n" ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n" "ext v25.16b, v25.16b, v25.16b, #0x1\n" ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n" "ldr q1, [%x[params], #0x10]\n" ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n" ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n" "ext v20.16b, v20.16b, v20.16b, #0x1\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n" ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "ldr q6, [%x[params], #0x60]\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "srshl v2.4s, v2.4s, v1.4s\n" "sqadd v26.4s, v26.4s, v28.4s\n" "ldr q28, [%x[params], #0x40]\n" "sqadd v21.4s, v21.4s, v27.4s\n" "ldr q27, [%x[params], #0x50]\n" "sqadd v16.4s, v16.4s, v29.4s\n" "ldr q29, [%x[params], #0x30]\n" "add v2.4s, v2.4s, v7.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "ldr q1, [%x[params], #0x70]\n" "smax v2.4s, v2.4s, v9.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s2, [x24, x11]\n" "ldr q2, [%x[params], #0x20]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "str s26, [x23, x11]\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s21, [x22, x11]\n" "mov v26.16b, v2.16b\n" "str s16, [x21, x11]\n" "mov v21.16b, v2.16b\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n" ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n" ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n" "ext v5.16b, v5.16b, v5.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v0.16b, v0.16b, v0.16b, #0x1\n" ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n" ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n" ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n" ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n" ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n" "ext v19.16b, v19.16b, v19.16b, #0x1\n" ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n" ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "ldr q6, [%x[params], #0xc0]\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "srshl v2.4s, v2.4s, v1.4s\n" "sqadd v26.4s, v26.4s, v28.4s\n" "ldr q28, [%x[params], #0xa0]\n" "sqadd v21.4s, v21.4s, v27.4s\n" "ldr q27, [%x[params], #0xb0]\n" "sqadd v16.4s, v16.4s, v29.4s\n" "ldr q29, [%x[params], #0x90]\n" "add v2.4s, v2.4s, v7.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "ldr q1, [%x[params], #0xd0]\n" "smax v2.4s, v2.4s, v9.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "str s2, [x24, x11]\n" "ldr q2, [%x[params], #0x80]\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s26, [x23, x11]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s21, [x22, x11]\n" "str s16, [x21, x11]\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n" ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n" "add x11, x11, #0x4\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "ext v31.16b, v31.16b, v31.16b, #0x1\n" ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n" ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n" ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n" ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n" ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n" ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n" "ext v18.16b, v18.16b, v18.16b, #0x1\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n" ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "ldr q6, [%x[params], #0x120]\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "srshl v2.4s, v2.4s, v1.4s\n" "sqadd v26.4s, v26.4s, v28.4s\n" "ldr q28, [%x[params], #0x100]\n" "sqadd v21.4s, v21.4s, v27.4s\n" "ldr q27, [%x[params], #0x110]\n" "sqadd v16.4s, v16.4s, v29.4s\n" "ldr q29, [%x[params], #0xf0]\n" "add v2.4s, v2.4s, v7.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "ldr q1, [%x[params], #0x130]\n" "smax v2.4s, v2.4s, v9.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s2, [x24, x11]\n" "ldr q2, [%x[params], #0xe0]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "add %x[params], %x[params], #0x140\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "str s26, [x23, x11]\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s21, [x22, x11]\n" "mov v26.16b, v2.16b\n" "str s16, [x21, x11]\n" "mov v21.16b, v2.16b\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n" ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n" ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "add x11, x11, #0x4\n" "ext v30.16b, v30.16b, v30.16b, #0x1\n" ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n" ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n" ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n" ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n" ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n" "ext v17.16b, v17.16b, v17.16b, #0x1\n" ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n" ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v28.4s\n" "sqadd v21.4s, v21.4s, v27.4s\n" "sqadd v16.4s, v16.4s, v29.4s\n" "srshl v2.4s, v2.4s, v1.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "add v2.4s, v2.4s, v7.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smax v2.4s, v2.4s, v9.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "str s2, [x24, x11]\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "str s26, [x23, x11]\n" "str s21, [x22, x11]\n" "str s16, [x21, x11]\n" "add x11, x11, #0x4\n" "beq 35f\n" "3:" // Oddments "and x20, %x[n_channels], #0xf\n" "add x14, x14, x12\n" "add x13, x13, x12\n" "add x10, x10, x12\n" "add x9, x9, x12\n" "add x28, x28, x12\n" "add x27, x27, x12\n" "add x26, x26, x12\n" "add x25, x25, x12\n" "tbz %x[n_channels], #3, 7f\n" "ldr d6, [x14], #0x8\n" "ldr d5, [x13], #0x8\n" "ldr d4, [x10], #0x8\n" "ldr d3, [x9], #0x8\n" "ldr d1, [x28], #0x8\n" "ldr d0, [x27], #0x8\n" "ldr d31, [x26], #0x8\n" "ldr d30, [x25], #0x8\n" "tbz %x[n_channels], #2, 5f\n" "ld1 { v6.s }[2], [x14], #0x4\n" "ld1 { v5.s }[2], [x13], #0x4\n" "ld1 { v4.s }[2], [x10], #0x4\n" "ld1 { v3.s }[2], [x9], #0x4\n" "ld1 { v1.s }[2], [x28], #0x4\n" "ld1 { v0.s }[2], [x27], #0x4\n" "ld1 { v31.s }[2], [x26], #0x4\n" "ld1 { v30.s }[2], [x25], #0x4\n" "tbz %x[n_channels], #1, 4f\n" "ld1 { v6.h }[6], [x14], #0x2\n" "ld1 { v5.h }[6], [x13], #0x2\n" "ld1 { v4.h }[6], [x10], #0x2\n" "ld1 { v3.h }[6], [x9], #0x2\n" "ld1 { v1.h }[6], [x28], #0x2\n" "ld1 { v0.h }[6], [x27], #0x2\n" "ld1 { v31.h }[6], [x26], #0x2\n" "ld1 { v30.h }[6], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v6.b }[14], [x14], #0x1\n" "ld1 { v5.b }[14], [x13], #0x1\n" "ld1 { v4.b }[14], [x10], #0x1\n" "ld1 { v3.b }[14], [x9], #0x1\n" "ld1 { v1.b }[14], [x28], #0x1\n" "ld1 { v0.b }[14], [x27], #0x1\n" "ld1 { v31.b }[14], [x26], #0x1\n" "ld1 { v30.b }[14], [x25], #0x1\n" "b 11f\n" "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v6.b }[12], [x14], #0x1\n" "ld1 { v5.b }[12], [x13], #0x1\n" "ld1 { v4.b }[12], [x10], #0x1\n" "ld1 { v3.b }[12], [x9], #0x1\n" "ld1 { v1.b }[12], [x28], #0x1\n" "ld1 { v0.b }[12], [x27], #0x1\n" "ld1 { v31.b }[12], [x26], #0x1\n" "ld1 { v30.b }[12], [x25], #0x1\n" "b 11f\n" "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 6f\n" "ld1 { v6.h }[4], [x14], #0x2\n" "ld1 { v5.h }[4], [x13], #0x2\n" "ld1 { v4.h }[4], [x10], #0x2\n" "ld1 { v3.h }[4], [x9], #0x2\n" "ld1 { v1.h }[4], [x28], #0x2\n" "ld1 { v0.h }[4], [x27], #0x2\n" "ld1 { v31.h }[4], [x26], #0x2\n" "ld1 { v30.h }[4], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v6.b }[10], [x14], #0x1\n" "ld1 { v5.b }[10], [x13], #0x1\n" "ld1 { v4.b }[10], [x10], #0x1\n" "ld1 { v3.b }[10], [x9], #0x1\n" "ld1 { v1.b }[10], [x28], #0x1\n" "ld1 { v0.b }[10], [x27], #0x1\n" "ld1 { v31.b }[10], [x26], #0x1\n" "ld1 { v30.b }[10], [x25], #0x1\n" "b 11f\n" "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v6.b }[8], [x14], #0x1\n" "ld1 { v5.b }[8], [x13], #0x1\n" "ld1 { v4.b }[8], [x10], #0x1\n" "ld1 { v3.b }[8], [x9], #0x1\n" "ld1 { v1.b }[8], [x28], #0x1\n" "ld1 { v0.b }[8], [x27], #0x1\n" "ld1 { v31.b }[8], [x26], #0x1\n" "ld1 { v30.b }[8], [x25], #0x1\n" "b 11f\n" "7:" // Oddments: Load (A): Bit 3: Unset "tbz %x[n_channels], #2, 9f\n" "ldr s6, [x14], #0x4\n" "ldr s5, [x13], #0x4\n" "ldr s4, [x10], #0x4\n" "ldr s3, [x9], #0x4\n" "ldr s1, [x28], #0x4\n" "ldr s0, [x27], #0x4\n" "ldr s31, [x26], #0x4\n" "ldr s30, [x25], #0x4\n" "tbz %x[n_channels], #1, 8f\n" "ld1 { v6.h }[2], [x14], #0x2\n" "ld1 { v5.h }[2], [x13], #0x2\n" "ld1 { v4.h }[2], [x10], #0x2\n" "ld1 { v3.h }[2], [x9], #0x2\n" "ld1 { v1.h }[2], [x28], #0x2\n" "ld1 { v0.h }[2], [x27], #0x2\n" "ld1 { v31.h }[2], [x26], #0x2\n" "ld1 { v30.h }[2], [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v6.b }[6], [x14], #0x1\n" "ld1 { v5.b }[6], [x13], #0x1\n" "ld1 { v4.b }[6], [x10], #0x1\n" "ld1 { v3.b }[6], [x9], #0x1\n" "ld1 { v1.b }[6], [x28], #0x1\n" "ld1 { v0.b }[6], [x27], #0x1\n" "ld1 { v31.b }[6], [x26], #0x1\n" "ld1 { v30.b }[6], [x25], #0x1\n" "b 11f\n" "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 11f\n" "ld1 { v6.b }[4], [x14], #0x1\n" "ld1 { v5.b }[4], [x13], #0x1\n" "ld1 { v4.b }[4], [x10], #0x1\n" "ld1 { v3.b }[4], [x9], #0x1\n" "ld1 { v1.b }[4], [x28], #0x1\n" "ld1 { v0.b }[4], [x27], #0x1\n" "ld1 { v31.b }[4], [x26], #0x1\n" "ld1 { v30.b }[4], [x25], #0x1\n" "b 11f\n" "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 10f\n" "ldr h6, [x14], #0x2\n" "ldr h5, [x13], #0x2\n" "ldr h4, [x10], #0x2\n" "ldr h3, [x9], #0x2\n" "ldr h1, [x28], #0x2\n" "ldr h0, [x27], #0x2\n" "ldr h31, [x26], #0x2\n" "ldr h30, [x25], #0x2\n" "tbz %x[n_channels], #0, 11f\n" "ld1 { v6.b }[2], [x14], #0x1\n" "ld1 { v5.b }[2], [x13], #0x1\n" "ld1 { v4.b }[2], [x10], #0x1\n" "ld1 { v3.b }[2], [x9], #0x1\n" "ld1 { v1.b }[2], [x28], #0x1\n" "ld1 { v0.b }[2], [x27], #0x1\n" "ld1 { v31.b }[2], [x26], #0x1\n" "ld1 { v30.b }[2], [x25], #0x1\n" "b 11f\n" "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b6, [x14], #0x1\n" "ldr b5, [x13], #0x1\n" "ldr b4, [x10], #0x1\n" "ldr b3, [x9], #0x1\n" "ldr b1, [x28], #0x1\n" "ldr b0, [x27], #0x1\n" "ldr b31, [x26], #0x1\n" "ldr b30, [x25], #0x1\n" "11:" // Oddments: Load (A): Bit 3: End "ldp x14, x13, [%x[inptrs], #0x40]\n" "ldp x10, x9, [%x[inptrs], #0x50]\n" "add x14, x14, x12\n" "add x13, x13, x12\n" "ldp x28, x27, [%x[inptrs], #0x60]\n" "ldp x26, x25, [%x[inptrs], #0x70]\n" "add x10, x10, x12\n" "add x9, x9, x12\n" "add x28, x28, x12\n" "add x27, x27, x12\n" "add x26, x26, x12\n" "add x25, x25, x12\n" "tbz %x[n_channels], #3, 15f\n" "ldr d25, [x14], #0x8\n" "ldr d24, [x13], #0x8\n" "ldr d23, [x10], #0x8\n" "ldr d22, [x9], #0x8\n" "ldr d20, [x28], #0x8\n" "ldr d19, [x27], #0x8\n" "ldr d18, [x26], #0x8\n" "ldr d17, [x25], #0x8\n" "tbz %x[n_channels], #2, 13f\n" "ld1 { v25.s }[2], [x14], #0x4\n" "ld1 { v24.s }[2], [x13], #0x4\n" "ld1 { v23.s }[2], [x10], #0x4\n" "ld1 { v22.s }[2], [x9], #0x4\n" "ld1 { v20.s }[2], [x28], #0x4\n" "ld1 { v19.s }[2], [x27], #0x4\n" "ld1 { v18.s }[2], [x26], #0x4\n" "ld1 { v17.s }[2], [x25], #0x4\n" "tbz %x[n_channels], #1, 12f\n" "ld1 { v25.h }[6], [x14], #0x2\n" "ld1 { v24.h }[6], [x13], #0x2\n" "ld1 { v23.h }[6], [x10], #0x2\n" "ld1 { v22.h }[6], [x9], #0x2\n" "ld1 { v20.h }[6], [x28], #0x2\n" "ld1 { v19.h }[6], [x27], #0x2\n" "ld1 { v18.h }[6], [x26], #0x2\n" "ld1 { v17.h }[6], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v25.b }[14], [x14], #0x1\n" "ld1 { v24.b }[14], [x13], #0x1\n" "ld1 { v23.b }[14], [x10], #0x1\n" "ld1 { v22.b }[14], [x9], #0x1\n" "ld1 { v20.b }[14], [x28], #0x1\n" "ld1 { v19.b }[14], [x27], #0x1\n" "ld1 { v18.b }[14], [x26], #0x1\n" "ld1 { v17.b }[14], [x25], #0x1\n" "b 19f\n" "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v25.b }[12], [x14], #0x1\n" "ld1 { v24.b }[12], [x13], #0x1\n" "ld1 { v23.b }[12], [x10], #0x1\n" "ld1 { v22.b }[12], [x9], #0x1\n" "ld1 { v20.b }[12], [x28], #0x1\n" "ld1 { v19.b }[12], [x27], #0x1\n" "ld1 { v18.b }[12], [x26], #0x1\n" "ld1 { v17.b }[12], [x25], #0x1\n" "b 19f\n" "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset "tbz %x[n_channels], #1, 14f\n" "ld1 { v25.h }[4], [x14], #0x2\n" "ld1 { v24.h }[4], [x13], #0x2\n" "ld1 { v23.h }[4], [x10], #0x2\n" "ld1 { v22.h }[4], [x9], #0x2\n" "ld1 { v20.h }[4], [x28], #0x2\n" "ld1 { v19.h }[4], [x27], #0x2\n" "ld1 { v18.h }[4], [x26], #0x2\n" "ld1 { v17.h }[4], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v25.b }[10], [x14], #0x1\n" "ld1 { v24.b }[10], [x13], #0x1\n" "ld1 { v23.b }[10], [x10], #0x1\n" "ld1 { v22.b }[10], [x9], #0x1\n" "ld1 { v20.b }[10], [x28], #0x1\n" "ld1 { v19.b }[10], [x27], #0x1\n" "ld1 { v18.b }[10], [x26], #0x1\n" "ld1 { v17.b }[10], [x25], #0x1\n" "b 19f\n" "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v25.b }[8], [x14], #0x1\n" "ld1 { v24.b }[8], [x13], #0x1\n" "ld1 { v23.b }[8], [x10], #0x1\n" "ld1 { v22.b }[8], [x9], #0x1\n" "ld1 { v20.b }[8], [x28], #0x1\n" "ld1 { v19.b }[8], [x27], #0x1\n" "ld1 { v18.b }[8], [x26], #0x1\n" "ld1 { v17.b }[8], [x25], #0x1\n" "b 19f\n" "15:" // Oddments: Load (B): Bit 3: Unset "tbz %x[n_channels], #2, 17f\n" "ldr s25, [x14], #0x4\n" "ldr s24, [x13], #0x4\n" "ldr s23, [x10], #0x4\n" "ldr s22, [x9], #0x4\n" "ldr s20, [x28], #0x4\n" "ldr s19, [x27], #0x4\n" "ldr s18, [x26], #0x4\n" "ldr s17, [x25], #0x4\n" "tbz %x[n_channels], #1, 16f\n" "ld1 { v25.h }[2], [x14], #0x2\n" "ld1 { v24.h }[2], [x13], #0x2\n" "ld1 { v23.h }[2], [x10], #0x2\n" "ld1 { v22.h }[2], [x9], #0x2\n" "ld1 { v20.h }[2], [x28], #0x2\n" "ld1 { v19.h }[2], [x27], #0x2\n" "ld1 { v18.h }[2], [x26], #0x2\n" "ld1 { v17.h }[2], [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v25.b }[6], [x14], #0x1\n" "ld1 { v24.b }[6], [x13], #0x1\n" "ld1 { v23.b }[6], [x10], #0x1\n" "ld1 { v22.b }[6], [x9], #0x1\n" "ld1 { v20.b }[6], [x28], #0x1\n" "ld1 { v19.b }[6], [x27], #0x1\n" "ld1 { v18.b }[6], [x26], #0x1\n" "ld1 { v17.b }[6], [x25], #0x1\n" "b 19f\n" "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset "tbz %x[n_channels], #0, 19f\n" "ld1 { v25.b }[4], [x14], #0x1\n" "ld1 { v24.b }[4], [x13], #0x1\n" "ld1 { v23.b }[4], [x10], #0x1\n" "ld1 { v22.b }[4], [x9], #0x1\n" "ld1 { v20.b }[4], [x28], #0x1\n" "ld1 { v19.b }[4], [x27], #0x1\n" "ld1 { v18.b }[4], [x26], #0x1\n" "ld1 { v17.b }[4], [x25], #0x1\n" "b 19f\n" "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset "tbz %x[n_channels], #1, 18f\n" "ldr h25, [x14], #0x2\n" "ldr h24, [x13], #0x2\n" "ldr h23, [x10], #0x2\n" "ldr h22, [x9], #0x2\n" "ldr h20, [x28], #0x2\n" "ldr h19, [x27], #0x2\n" "ldr h18, [x26], #0x2\n" "ldr h17, [x25], #0x2\n" "tbz %x[n_channels], #0, 19f\n" "ld1 { v25.b }[2], [x14], #0x1\n" "ld1 { v24.b }[2], [x13], #0x1\n" "ld1 { v23.b }[2], [x10], #0x1\n" "ld1 { v22.b }[2], [x9], #0x1\n" "ld1 { v20.b }[2], [x28], #0x1\n" "ld1 { v19.b }[2], [x27], #0x1\n" "ld1 { v18.b }[2], [x26], #0x1\n" "ld1 { v17.b }[2], [x25], #0x1\n" "b 19f\n" "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset "ldr b25, [x14], #0x1\n" "ldr b24, [x13], #0x1\n" "ldr b23, [x10], #0x1\n" "ldr b22, [x9], #0x1\n" "ldr b20, [x28], #0x1\n" "ldr b19, [x27], #0x1\n" "ldr b18, [x26], #0x1\n" "ldr b17, [x25], #0x1\n" "19:" // Oddments: Load (B): Bit 3: End "ldr q29, [%x[params], #0x10]\n" "ldr q28, [%x[params], #0x20]\n" "zip2 v2.16b, v6.16b, v4.16b\n" "zip1 v6.16b, v6.16b, v4.16b\n" "ldr q27, [%x[params], #0x30]\n" "zip1 v4.16b, v5.16b, v3.16b\n" "zip2 v3.16b, v5.16b, v3.16b\n" "cmp x20, #0x4\n" "zip2 v5.16b, v6.16b, v4.16b\n" "zip1 v6.16b, v6.16b, v4.16b\n" "zip1 v4.16b, v2.16b, v3.16b\n" "zip2 v3.16b, v2.16b, v3.16b\n" "ldr q2, [%x[params], #0x0]\n" "zip2 v26.16b, v1.16b, v31.16b\n" "zip1 v1.16b, v1.16b, v31.16b\n" "zip1 v31.16b, v0.16b, v30.16b\n" "zip2 v30.16b, v0.16b, v30.16b\n" "zip2 v21.16b, v25.16b, v23.16b\n" "zip1 v25.16b, v25.16b, v23.16b\n" "zip1 v23.16b, v24.16b, v22.16b\n" "zip2 v22.16b, v24.16b, v22.16b\n" "zip2 v16.16b, v20.16b, v18.16b\n" "zip1 v20.16b, v20.16b, v18.16b\n" "zip1 v18.16b, v19.16b, v17.16b\n" "zip2 v17.16b, v19.16b, v17.16b\n" "zip2 v0.16b, v1.16b, v31.16b\n" "zip1 v1.16b, v1.16b, v31.16b\n" "zip1 v31.16b, v26.16b, v30.16b\n" "zip2 v30.16b, v26.16b, v30.16b\n" "zip2 v24.16b, v25.16b, v23.16b\n" "zip1 v25.16b, v25.16b, v23.16b\n" "zip1 v23.16b, v21.16b, v22.16b\n" "zip2 v22.16b, v21.16b, v22.16b\n" "zip2 v19.16b, v20.16b, v18.16b\n" "zip1 v20.16b, v20.16b, v18.16b\n" "zip1 v18.16b, v16.16b, v17.16b\n" "zip2 v17.16b, v16.16b, v17.16b\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" ".inst 0x4e8197b5 // sdot v21.4s, v29.16b, v1.16b\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8697a2 // sdot v2.4s, v29.16b, v6.16b\n" ".inst 0x4e819782 // sdot v2.4s, v28.16b, v1.16b\n" "ext v6.16b, v6.16b, v6.16b, #0x1\n" "ext v1.16b, v1.16b, v1.16b, #0x1\n" ".inst 0x4e8697ba // sdot v26.4s, v29.16b, v6.16b\n" "ldr q6, [%x[params], #0x40]\n" ".inst 0x4e8197b0 // sdot v16.4s, v29.16b, v1.16b\n" ".inst 0x4e999795 // sdot v21.4s, v28.16b, v25.16b\n" ".inst 0x4e999762 // sdot v2.4s, v27.16b, v25.16b\n" "ext v25.16b, v25.16b, v25.16b, #0x1\n" ".inst 0x4e81979a // sdot v26.4s, v28.16b, v1.16b\n" "ldr q1, [%x[params], #0x50]\n" ".inst 0x4e999790 // sdot v16.4s, v28.16b, v25.16b\n" ".inst 0x4e949775 // sdot v21.4s, v27.16b, v20.16b\n" "ext v20.16b, v20.16b, v20.16b, #0x1\n" "add %x[params], %x[params], #0x60\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e99977a // sdot v26.4s, v27.16b, v25.16b\n" ".inst 0x4e949770 // sdot v16.4s, v27.16b, v20.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v28.4s\n" "sqadd v21.4s, v21.4s, v27.4s\n" "sqadd v16.4s, v16.4s, v29.4s\n" "srshl v2.4s, v2.4s, v1.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "add v2.4s, v2.4s, v7.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smax v2.4s, v2.4s, v9.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "blt 20f\n" "str s2, [x24, x11]\n" "str s26, [x23, x11]\n" "str s21, [x22, x11]\n" "str s16, [x21, x11]\n" "b 23f\n" "20:" // Oddments: Unroll 0: Oddment store "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "add x21, x21, x11\n" "tbz x20, #1, 21f\n" "st1 { v2.h }[0], [x24], #0x2\n" "st1 { v26.h }[0], [x23], #0x2\n" "st1 { v21.h }[0], [x22], #0x2\n" "st1 { v16.h }[0], [x21], #0x2\n" "tbz x20, #0, 22f\n" "st1 { v2.b }[2], [x24], #0x1\n" "st1 { v26.b }[2], [x23], #0x1\n" "st1 { v21.b }[2], [x22], #0x1\n" "st1 { v16.b }[2], [x21], #0x1\n" "b 22f\n" "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset "st1 { v2.b }[0], [x24], #0x1\n" "st1 { v26.b }[0], [x23], #0x1\n" "st1 { v21.b }[0], [x22], #0x1\n" "st1 { v16.b }[0], [x21], #0x1\n" "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End "23:" // Oddments: Unroll 0: After oddment store "subs x20, x20, #0x4\n" "add x11, x11, #0x4\n" "ble 35f\n" "ldr q2, [%x[params], #0x0]\n" "ldr q29, [%x[params], #0x10]\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" "ldr q28, [%x[params], #0x20]\n" "ldr q27, [%x[params], #0x30]\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8597a2 // sdot v2.4s, v29.16b, v5.16b\n" "ldr q6, [%x[params], #0x40]\n" "ldr q1, [%x[params], #0x50]\n" ".inst 0x4e8097b5 // sdot v21.4s, v29.16b, v0.16b\n" ".inst 0x4e809782 // sdot v2.4s, v28.16b, v0.16b\n" "ext v5.16b, v5.16b, v5.16b, #0x1\n" "ext v0.16b, v0.16b, v0.16b, #0x1\n" ".inst 0x4e8597ba // sdot v26.4s, v29.16b, v5.16b\n" "cmp x20, #0x4\n" ".inst 0x4e8097b0 // sdot v16.4s, v29.16b, v0.16b\n" ".inst 0x4e989795 // sdot v21.4s, v28.16b, v24.16b\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e989762 // sdot v2.4s, v27.16b, v24.16b\n" "ext v24.16b, v24.16b, v24.16b, #0x1\n" ".inst 0x4e80979a // sdot v26.4s, v28.16b, v0.16b\n" ".inst 0x4e989790 // sdot v16.4s, v28.16b, v24.16b\n" ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n" "ext v19.16b, v19.16b, v19.16b, #0x1\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e98977a // sdot v26.4s, v27.16b, v24.16b\n" ".inst 0x4e939770 // sdot v16.4s, v27.16b, v19.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v28.4s\n" "sqadd v21.4s, v21.4s, v27.4s\n" "sqadd v16.4s, v16.4s, v29.4s\n" "srshl v2.4s, v2.4s, v1.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "add v2.4s, v2.4s, v7.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smax v2.4s, v2.4s, v9.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "blt 24f\n" "str s2, [x24, x11]\n" "str s26, [x23, x11]\n" "str s21, [x22, x11]\n" "str s16, [x21, x11]\n" "b 27f\n" "24:" // Oddments: Unroll 1: Oddment store "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "add x21, x21, x11\n" "tbz x20, #1, 25f\n" "st1 { v2.h }[0], [x24], #0x2\n" "st1 { v26.h }[0], [x23], #0x2\n" "st1 { v21.h }[0], [x22], #0x2\n" "st1 { v16.h }[0], [x21], #0x2\n" "tbz x20, #0, 26f\n" "st1 { v2.b }[2], [x24], #0x1\n" "st1 { v26.b }[2], [x23], #0x1\n" "st1 { v21.b }[2], [x22], #0x1\n" "st1 { v16.b }[2], [x21], #0x1\n" "b 26f\n" "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset "st1 { v2.b }[0], [x24], #0x1\n" "st1 { v26.b }[0], [x23], #0x1\n" "st1 { v21.b }[0], [x22], #0x1\n" "st1 { v16.b }[0], [x21], #0x1\n" "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End "27:" // Oddments: Unroll 1: After oddment store "subs x20, x20, #0x4\n" "add x11, x11, #0x4\n" "ble 35f\n" "ldr q2, [%x[params], #0x0]\n" "ldr q29, [%x[params], #0x10]\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" "ldr q28, [%x[params], #0x20]\n" "ldr q27, [%x[params], #0x30]\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8497a2 // sdot v2.4s, v29.16b, v4.16b\n" "ldr q6, [%x[params], #0x40]\n" "ldr q1, [%x[params], #0x50]\n" ".inst 0x4e9f97b5 // sdot v21.4s, v29.16b, v31.16b\n" ".inst 0x4e9f9782 // sdot v2.4s, v28.16b, v31.16b\n" "ext v4.16b, v4.16b, v4.16b, #0x1\n" "ext v31.16b, v31.16b, v31.16b, #0x1\n" ".inst 0x4e8497ba // sdot v26.4s, v29.16b, v4.16b\n" "cmp x20, #0x4\n" ".inst 0x4e9f97b0 // sdot v16.4s, v29.16b, v31.16b\n" ".inst 0x4e979795 // sdot v21.4s, v28.16b, v23.16b\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e979762 // sdot v2.4s, v27.16b, v23.16b\n" "ext v23.16b, v23.16b, v23.16b, #0x1\n" ".inst 0x4e9f979a // sdot v26.4s, v28.16b, v31.16b\n" ".inst 0x4e979790 // sdot v16.4s, v28.16b, v23.16b\n" ".inst 0x4e929775 // sdot v21.4s, v27.16b, v18.16b\n" "ext v18.16b, v18.16b, v18.16b, #0x1\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e97977a // sdot v26.4s, v27.16b, v23.16b\n" ".inst 0x4e929770 // sdot v16.4s, v27.16b, v18.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v28.4s\n" "sqadd v21.4s, v21.4s, v27.4s\n" "sqadd v16.4s, v16.4s, v29.4s\n" "srshl v2.4s, v2.4s, v1.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "add v2.4s, v2.4s, v7.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smax v2.4s, v2.4s, v9.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "blt 28f\n" "str s2, [x24, x11]\n" "str s26, [x23, x11]\n" "str s21, [x22, x11]\n" "str s16, [x21, x11]\n" "b 31f\n" "28:" // Oddments: Unroll 2: Oddment store "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "add x21, x21, x11\n" "tbz x20, #1, 29f\n" "st1 { v2.h }[0], [x24], #0x2\n" "st1 { v26.h }[0], [x23], #0x2\n" "st1 { v21.h }[0], [x22], #0x2\n" "st1 { v16.h }[0], [x21], #0x2\n" "tbz x20, #0, 30f\n" "st1 { v2.b }[2], [x24], #0x1\n" "st1 { v26.b }[2], [x23], #0x1\n" "st1 { v21.b }[2], [x22], #0x1\n" "st1 { v16.b }[2], [x21], #0x1\n" "b 30f\n" "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset "st1 { v2.b }[0], [x24], #0x1\n" "st1 { v26.b }[0], [x23], #0x1\n" "st1 { v21.b }[0], [x22], #0x1\n" "st1 { v16.b }[0], [x21], #0x1\n" "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End "31:" // Oddments: Unroll 2: After oddment store "subs x20, x20, #0x4\n" "add x11, x11, #0x4\n" "ble 35f\n" "ldr q2, [%x[params], #0x0]\n" "ldr q29, [%x[params], #0x10]\n" "mov v26.16b, v2.16b\n" "mov v21.16b, v2.16b\n" "ldr q28, [%x[params], #0x20]\n" "ldr q27, [%x[params], #0x30]\n" "mov v16.16b, v2.16b\n" ".inst 0x4e8397a2 // sdot v2.4s, v29.16b, v3.16b\n" "ldr q6, [%x[params], #0x40]\n" "ldr q1, [%x[params], #0x50]\n" ".inst 0x4e9e97b5 // sdot v21.4s, v29.16b, v30.16b\n" ".inst 0x4e9e9782 // sdot v2.4s, v28.16b, v30.16b\n" "ext v3.16b, v3.16b, v3.16b, #0x1\n" "ext v30.16b, v30.16b, v30.16b, #0x1\n" ".inst 0x4e8397ba // sdot v26.4s, v29.16b, v3.16b\n" "add %x[params], %x[params], #0x60\n" ".inst 0x4e9e97b0 // sdot v16.4s, v29.16b, v30.16b\n" ".inst 0x4e969795 // sdot v21.4s, v28.16b, v22.16b\n" ".inst 0x4e969762 // sdot v2.4s, v27.16b, v22.16b\n" "ext v22.16b, v22.16b, v22.16b, #0x1\n" ".inst 0x4e9e979a // sdot v26.4s, v28.16b, v30.16b\n" ".inst 0x4e969790 // sdot v16.4s, v28.16b, v22.16b\n" ".inst 0x4e919775 // sdot v21.4s, v27.16b, v17.16b\n" "ext v17.16b, v17.16b, v17.16b, #0x1\n" "sqrdmulh v2.4s, v2.4s, v6.4s\n" ".inst 0x4e96977a // sdot v26.4s, v27.16b, v22.16b\n" ".inst 0x4e919770 // sdot v16.4s, v27.16b, v17.16b\n" "and v29.16b, v2.16b, v1.16b\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqrdmulh v26.4s, v26.4s, v6.4s\n" "sqrdmulh v21.4s, v21.4s, v6.4s\n" "sqrdmulh v16.4s, v16.4s, v6.4s\n" "sqadd v2.4s, v2.4s, v29.4s\n" "and v28.16b, v26.16b, v1.16b\n" "and v27.16b, v21.16b, v1.16b\n" "and v29.16b, v16.16b, v1.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" "sshr v27.4s, v27.4s, #0x1f\n" "sshr v29.4s, v29.4s, #0x1f\n" "sqadd v26.4s, v26.4s, v28.4s\n" "sqadd v21.4s, v21.4s, v27.4s\n" "sqadd v16.4s, v16.4s, v29.4s\n" "srshl v2.4s, v2.4s, v1.4s\n" "srshl v26.4s, v26.4s, v1.4s\n" "srshl v21.4s, v21.4s, v1.4s\n" "srshl v16.4s, v16.4s, v1.4s\n" "add v2.4s, v2.4s, v7.4s\n" "add v26.4s, v26.4s, v7.4s\n" "add v21.4s, v21.4s, v7.4s\n" "add v16.4s, v16.4s, v7.4s\n" "smax v2.4s, v2.4s, v9.4s\n" "smax v26.4s, v26.4s, v9.4s\n" "smax v21.4s, v21.4s, v9.4s\n" "smax v16.4s, v16.4s, v9.4s\n" "smin v2.4s, v2.4s, v8.4s\n" "smin v26.4s, v26.4s, v8.4s\n" "smin v21.4s, v21.4s, v8.4s\n" "smin v16.4s, v16.4s, v8.4s\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "uzp1 v2.16b, v2.16b, v2.16b\n" "uzp1 v26.16b, v26.16b, v26.16b\n" "uzp1 v21.16b, v21.16b, v21.16b\n" "uzp1 v16.16b, v16.16b, v16.16b\n" "32:" // Oddments: Unroll 3: Oddment store "add x24, x24, x11\n" "add x23, x23, x11\n" "add x22, x22, x11\n" "add x21, x21, x11\n" "tbz x20, #1, 33f\n" "st1 { v2.h }[0], [x24], #0x2\n" "st1 { v26.h }[0], [x23], #0x2\n" "st1 { v21.h }[0], [x22], #0x2\n" "st1 { v16.h }[0], [x21], #0x2\n" "tbz x20, #0, 34f\n" "st1 { v2.b }[2], [x24], #0x1\n" "st1 { v26.b }[2], [x23], #0x1\n" "st1 { v21.b }[2], [x22], #0x1\n" "st1 { v16.b }[2], [x21], #0x1\n" "b 34f\n" "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset "st1 { v2.b }[0], [x24], #0x1\n" "st1 { v26.b }[0], [x23], #0x1\n" "st1 { v21.b }[0], [x22], #0x1\n" "st1 { v16.b }[0], [x21], #0x1\n" "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End "35:" // End : [params] "+&r" (params) : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } } // namespace depthwise } // namespace arm_conv #endif // defined(__aarch64__)