/* * Copyright (c) 2024 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #ifdef __aarch64__ template<> void MergeResults<12, 8, false>( bfloat16 *out_ptr, const float * in_ptr, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const bfloat16 *bias, Activation act, bool accumulate) { float maxval = static_cast(std::numeric_limits::infinity()); float minval = - static_cast(std::numeric_limits::infinity()); switch(act.type) { default: case Activation::Type::None: break; case Activation::Type::BoundedReLU: maxval = static_cast(act.param1); /* fall through */ case Activation::Type::ReLU: minval = 0; break; } size_t rows = ymax-y0; size_t cols = xmax-x0; out_ptr += (y0 * ldout) + x0; bias = (bias == nullptr) ? nullptr : bias + x0; __asm__ __volatile__( "cbz %x[cols], 108f\n" "cbz %x[rows], 108f\n" "mov x11, #0x20\n" "dup v13.4s, %w[maxval]\n" "dup v12.4s, %w[minval]\n" "mul x11, %x[ldout], x11\n" "cbnz %x[accumulate], 66f\n" "1:" // Initial: Row loop "cmp %x[rows], #0x7\n" "bgt 58f\n" "beq 50f\n" "cmp %x[rows], #0x5\n" "bgt 42f\n" "beq 34f\n" "cmp %x[rows], #0x3\n" "bgt 26f\n" "beq 18f\n" "cmp %x[rows], #0x1\n" "bgt 10f\n" "2:" // Initial: Height 1 "mov x10, %x[cols]\n" "mov x9, %x[out_ptr]\n" "mov x28, %x[bias]\n" "cmp x10, #0xc\n" "blt 6f\n" "3:" // Initial: Height 1: Block loop "cbnz %x[bias], 4f\n" "movi v21.16b, #0x0\n" "movi v20.16b, #0x0\n" "movi v19.16b, #0x0\n" "b 5f\n" "4:" // Initial: Height 1: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v21.4s, v18.4h, #0x10\n" "shll v20.4s, v17.4h, #0x10\n" "shll v19.4s, v16.4h, #0x10\n" "5:" // Initial: Height 1: Width 3: init done "ldr q18, [%x[in_ptr], #0x0]\n" "ldr q17, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q16, [%x[in_ptr], #0x20]\n" "cmp x10, #0xc\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v18.4s, v18.4s, v21.4s\n" "fadd v17.4s, v17.4s, v20.4s\n" "fadd v16.4s, v16.4s, v19.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" "str d18, [x9, #0x0]\n" "str d17, [x9, #0x8]\n" "str d16, [x9, #0x10]\n" "add x9, x9, #0x18\n" "bge 3b\n" "6:" // Initial: Height 1: no full blocks "cbz x10, 9f\n" "mov x20, %x[in_ptr]\n" "7:" // Initial: Height 1: Single loop "movi v17.16b, #0x0\n" "cbz %x[bias], 8f\n" "ldr h16, [x28, #0x0]\n" "shll v17.4s, v16.4h, #0x10\n" "8:" // Initial: Height 1: Scalar: no bias "ldr s16, [%x[in_ptr], #0x0]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v16.4s, v16.4s, v17.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" "bne 7b\n" "add %x[in_ptr], x20, #0x180\n" "9:" // Initial: Height 1: no oddments "b 108f\n" "10:" // Initial: Height 2 "mov x10, %x[cols]\n" "mov x9, %x[out_ptr]\n" "mov x28, %x[bias]\n" "cmp x10, #0xc\n" "add x27, x9, %x[ldout], LSL #1\n" "blt 14f\n" "11:" // Initial: Height 2: Block loop "cbnz %x[bias], 12f\n" "movi v24.16b, #0x0\n" "movi v23.16b, #0x0\n" "movi v22.16b, #0x0\n" "b 13f\n" "12:" // Initial: Height 2: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v24.4s, v18.4h, #0x10\n" "shll v23.4s, v17.4h, #0x10\n" "shll v22.4s, v16.4h, #0x10\n" "13:" // Initial: Height 2: Width 3: init done "ldr q16, [%x[in_ptr], #0x0]\n" "ldr q20, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q19, [%x[in_ptr], #0x20]\n" "ldr q18, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q17, [%x[in_ptr], #0x40]\n" "ldr q21, [%x[in_ptr], #0x50]\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v16.4s, v16.4s, v24.4s\n" "fadd v20.4s, v20.4s, v23.4s\n" "fadd v19.4s, v19.4s, v22.4s\n" "fadd v18.4s, v18.4s, v24.4s\n" "fadd v17.4s, v17.4s, v23.4s\n" "fadd v21.4s, v21.4s, v22.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str d16, [x9, #0x0]\n" ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n" "str d20, [x9, #0x8]\n" "str d19, [x9, #0x10]\n" "add x9, x9, #0x18\n" "str d18, [x27, #0x0]\n" "str d17, [x27, #0x8]\n" "str d16, [x27, #0x10]\n" "add x27, x27, #0x18\n" "bge 11b\n" "14:" // Initial: Height 2: no full blocks "cbz x10, 17f\n" "mov x20, %x[in_ptr]\n" "15:" // Initial: Height 2: Single loop "movi v18.16b, #0x0\n" "cbz %x[bias], 16f\n" "ldr h16, [x28, #0x0]\n" "shll v18.4s, v16.4h, #0x10\n" "16:" // Initial: Height 2: Scalar: no bias "ldr s17, [%x[in_ptr], #0x0]\n" "ldr s16, [%x[in_ptr], #0x30]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v17.4s, v17.4s, v18.4s\n" "fadd v16.4s, v16.4s, v18.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" "str h17, [x9, #0x0]\n" "add x9, x9, #0x2\n" "str h16, [x27, #0x0]\n" "add x27, x27, #0x2\n" "bne 15b\n" "add %x[in_ptr], x20, #0x180\n" "17:" // Initial: Height 2: no oddments "b 108f\n" "18:" // Initial: Height 3 "mov x10, %x[cols]\n" "mov x9, %x[out_ptr]\n" "mov x28, %x[bias]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "blt 22f\n" "19:" // Initial: Height 3: Block loop "cbnz %x[bias], 20f\n" "movi v27.16b, #0x0\n" "movi v26.16b, #0x0\n" "movi v25.16b, #0x0\n" "b 21f\n" "20:" // Initial: Height 3: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v27.4s, v18.4h, #0x10\n" "shll v26.4s, v17.4h, #0x10\n" "shll v25.4s, v16.4h, #0x10\n" "21:" // Initial: Height 3: Width 3: init done "ldr q18, [%x[in_ptr], #0x0]\n" "ldr q17, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q16, [%x[in_ptr], #0x20]\n" "ldr q21, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q20, [%x[in_ptr], #0x40]\n" "ldr q19, [%x[in_ptr], #0x50]\n" "ldr q24, [%x[in_ptr], #0x60]\n" "ldr q23, [%x[in_ptr], #0x70]\n" "fadd v18.4s, v18.4s, v27.4s\n" "fadd v17.4s, v17.4s, v26.4s\n" "ldr q22, [%x[in_ptr], #0x80]\n" "fadd v16.4s, v16.4s, v25.4s\n" "fadd v21.4s, v21.4s, v27.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v20.4s, v20.4s, v26.4s\n" "fadd v19.4s, v19.4s, v25.4s\n" "fadd v24.4s, v24.4s, v27.4s\n" "fadd v23.4s, v23.4s, v26.4s\n" "fadd v22.4s, v22.4s, v25.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" "str d18, [x9, #0x0]\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n" "str d17, [x9, #0x8]\n" "str d16, [x9, #0x10]\n" ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "add x9, x9, #0x18\n" "str d21, [x27, #0x0]\n" "str d20, [x27, #0x8]\n" "str d19, [x27, #0x10]\n" "add x27, x27, #0x18\n" "str d18, [x26, #0x0]\n" "str d17, [x26, #0x8]\n" "str d16, [x26, #0x10]\n" "add x26, x26, #0x18\n" "bge 19b\n" "22:" // Initial: Height 3: no full blocks "cbz x10, 25f\n" "mov x20, %x[in_ptr]\n" "23:" // Initial: Height 3: Single loop "movi v19.16b, #0x0\n" "cbz %x[bias], 24f\n" "ldr h16, [x28, #0x0]\n" "shll v19.4s, v16.4h, #0x10\n" "24:" // Initial: Height 3: Scalar: no bias "ldr s16, [%x[in_ptr], #0x0]\n" "ldr s17, [%x[in_ptr], #0x30]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "ldr s18, [%x[in_ptr], #0x60]\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v16.4s, v16.4s, v19.4s\n" "fadd v17.4s, v17.4s, v19.4s\n" "fadd v18.4s, v18.4s, v19.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n" "str h17, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h16, [x26, #0x0]\n" "add x26, x26, #0x2\n" "bne 23b\n" "add %x[in_ptr], x20, #0x180\n" "25:" // Initial: Height 3: no oddments "b 108f\n" "26:" // Initial: Height 4 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "mov x28, %x[bias]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x25, x26, %x[ldout], LSL #1\n" "blt 30f\n" "27:" // Initial: Height 4: Block loop "cbnz %x[bias], 28f\n" "movi v30.16b, #0x0\n" "movi v29.16b, #0x0\n" "movi v28.16b, #0x0\n" "b 29f\n" "28:" // Initial: Height 4: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v30.4s, v18.4h, #0x10\n" "shll v29.4s, v17.4h, #0x10\n" "shll v28.4s, v16.4h, #0x10\n" "29:" // Initial: Height 4: Width 3: init done "ldr q19, [%x[in_ptr], #0x0]\n" "ldr q18, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q17, [%x[in_ptr], #0x20]\n" "ldr q16, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q23, [%x[in_ptr], #0x40]\n" "ldr q22, [%x[in_ptr], #0x50]\n" "ldr q21, [%x[in_ptr], #0x60]\n" "ldr q20, [%x[in_ptr], #0x70]\n" "fadd v19.4s, v19.4s, v30.4s\n" "fadd v18.4s, v18.4s, v29.4s\n" "ldr q27, [%x[in_ptr], #0x80]\n" "ldr q26, [%x[in_ptr], #0x90]\n" "fadd v17.4s, v17.4s, v28.4s\n" "fadd v16.4s, v16.4s, v30.4s\n" "ldr q25, [%x[in_ptr], #0xa0]\n" "ldr q24, [%x[in_ptr], #0xb0]\n" "fadd v23.4s, v23.4s, v29.4s\n" "fadd v22.4s, v22.4s, v28.4s\n" "fadd v21.4s, v21.4s, v30.4s\n" "fadd v20.4s, v20.4s, v29.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v27.4s, v27.4s, v28.4s\n" "fadd v26.4s, v26.4s, v30.4s\n" "fadd v25.4s, v25.4s, v29.4s\n" "fadd v24.4s, v24.4s, v28.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" "str d19, [x9, #0x0]\n" "str d18, [x9, #0x8]\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" "str d17, [x9, #0x10]\n" ".inst 0x0ea16b73 // bfcvtn v19.4h, v27.4s\n" ".inst 0x0ea16b52 // bfcvtn v18.4h, v26.4s\n" "add x9, x9, #0x18\n" "str d16, [x27, #0x0]\n" ".inst 0x0ea16b31 // bfcvtn v17.4h, v25.4s\n" ".inst 0x0ea16b10 // bfcvtn v16.4h, v24.4s\n" "str d23, [x27, #0x8]\n" "str d22, [x27, #0x10]\n" "add x27, x27, #0x18\n" "str d21, [x26, #0x0]\n" "str d20, [x26, #0x8]\n" "str d19, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d18, [x25, #0x0]\n" "str d17, [x25, #0x8]\n" "str d16, [x25, #0x10]\n" "add x25, x25, #0x18\n" "bge 27b\n" "30:" // Initial: Height 4: no full blocks "cbz x10, 33f\n" "mov x20, %x[in_ptr]\n" "31:" // Initial: Height 4: Single loop "movi v20.16b, #0x0\n" "cbz %x[bias], 32f\n" "ldr h16, [x28, #0x0]\n" "shll v20.4s, v16.4h, #0x10\n" "32:" // Initial: Height 4: Scalar: no bias "ldr s16, [%x[in_ptr], #0x0]\n" "ldr s18, [%x[in_ptr], #0x30]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "ldr s17, [%x[in_ptr], #0x60]\n" "ldr s19, [%x[in_ptr], #0x90]\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v16.4s, v16.4s, v20.4s\n" "fadd v18.4s, v18.4s, v20.4s\n" "fadd v17.4s, v17.4s, v20.4s\n" "fadd v19.4s, v19.4s, v20.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n" "str h18, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h17, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h16, [x25, #0x0]\n" "add x25, x25, #0x2\n" "bne 31b\n" "add %x[in_ptr], x20, #0x180\n" "33:" // Initial: Height 4: no oddments "b 108f\n" "34:" // Initial: Height 5 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "mov x28, %x[bias]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x24, x25, %x[ldout], LSL #1\n" "blt 38f\n" "35:" // Initial: Height 5: Block loop "cbnz %x[bias], 36f\n" "movi v1.16b, #0x0\n" "movi v0.16b, #0x0\n" "movi v31.16b, #0x0\n" "b 37f\n" "36:" // Initial: Height 5: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v1.4s, v18.4h, #0x10\n" "shll v0.4s, v17.4h, #0x10\n" "shll v31.4s, v16.4h, #0x10\n" "37:" // Initial: Height 5: Width 3: init done "ldr q16, [%x[in_ptr], #0x0]\n" "ldr q20, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q19, [%x[in_ptr], #0x20]\n" "ldr q18, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q17, [%x[in_ptr], #0x40]\n" "ldr q30, [%x[in_ptr], #0x50]\n" "ldr q24, [%x[in_ptr], #0x60]\n" "ldr q23, [%x[in_ptr], #0x70]\n" "fadd v16.4s, v16.4s, v1.4s\n" "fadd v20.4s, v20.4s, v0.4s\n" "ldr q22, [%x[in_ptr], #0x80]\n" "ldr q21, [%x[in_ptr], #0x90]\n" "fadd v19.4s, v19.4s, v31.4s\n" "fadd v18.4s, v18.4s, v1.4s\n" "ldr q29, [%x[in_ptr], #0xa0]\n" "ldr q28, [%x[in_ptr], #0xb0]\n" "fadd v17.4s, v17.4s, v0.4s\n" "fadd v30.4s, v30.4s, v31.4s\n" "ldr q27, [%x[in_ptr], #0xc0]\n" "ldr q26, [%x[in_ptr], #0xd0]\n" "fadd v24.4s, v24.4s, v1.4s\n" "fadd v23.4s, v23.4s, v0.4s\n" "ldr q25, [%x[in_ptr], #0xe0]\n" "fadd v22.4s, v22.4s, v31.4s\n" "fadd v21.4s, v21.4s, v1.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v29.4s, v29.4s, v0.4s\n" "fadd v28.4s, v28.4s, v31.4s\n" "fadd v27.4s, v27.4s, v1.4s\n" "fadd v26.4s, v26.4s, v0.4s\n" "fadd v25.4s, v25.4s, v31.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str d16, [x9, #0x0]\n" ".inst 0x0ea16bd0 // bfcvtn v16.4h, v30.4s\n" ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n" "str d20, [x9, #0x8]\n" "str d19, [x9, #0x10]\n" ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" "add x9, x9, #0x18\n" "str d18, [x27, #0x0]\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16bb4 // bfcvtn v20.4h, v29.4s\n" "str d17, [x27, #0x8]\n" ".inst 0x0ea16b93 // bfcvtn v19.4h, v28.4s\n" ".inst 0x0ea16b72 // bfcvtn v18.4h, v27.4s\n" "str d16, [x27, #0x10]\n" ".inst 0x0ea16b51 // bfcvtn v17.4h, v26.4s\n" ".inst 0x0ea16b30 // bfcvtn v16.4h, v25.4s\n" "add x27, x27, #0x18\n" "str d24, [x26, #0x0]\n" "str d23, [x26, #0x8]\n" "str d22, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d21, [x25, #0x0]\n" "str d20, [x25, #0x8]\n" "str d19, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d18, [x24, #0x0]\n" "str d17, [x24, #0x8]\n" "str d16, [x24, #0x10]\n" "add x24, x24, #0x18\n" "bge 35b\n" "38:" // Initial: Height 5: no full blocks "cbz x10, 41f\n" "mov x20, %x[in_ptr]\n" "39:" // Initial: Height 5: Single loop "movi v21.16b, #0x0\n" "cbz %x[bias], 40f\n" "ldr h16, [x28, #0x0]\n" "shll v21.4s, v16.4h, #0x10\n" "40:" // Initial: Height 5: Scalar: no bias "ldr s16, [%x[in_ptr], #0x0]\n" "ldr s19, [%x[in_ptr], #0x30]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "ldr s18, [%x[in_ptr], #0x60]\n" "ldr s17, [%x[in_ptr], #0x90]\n" "ldr s20, [%x[in_ptr], #0xc0]\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v16.4s, v16.4s, v21.4s\n" "fadd v19.4s, v19.4s, v21.4s\n" "fadd v18.4s, v18.4s, v21.4s\n" "fadd v17.4s, v17.4s, v21.4s\n" "fadd v20.4s, v20.4s, v21.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" ".inst 0x0ea16a90 // bfcvtn v16.4h, v20.4s\n" "str h19, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h18, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h17, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h16, [x24, #0x0]\n" "add x24, x24, #0x2\n" "bne 39b\n" "add %x[in_ptr], x20, #0x180\n" "41:" // Initial: Height 5: no oddments "b 108f\n" "42:" // Initial: Height 6 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "mov x28, %x[bias]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x23, x24, %x[ldout], LSL #1\n" "blt 46f\n" "43:" // Initial: Height 6: Block loop "cbnz %x[bias], 44f\n" "movi v4.16b, #0x0\n" "movi v3.16b, #0x0\n" "movi v2.16b, #0x0\n" "b 45f\n" "44:" // Initial: Height 6: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v4.4s, v18.4h, #0x10\n" "shll v3.4s, v17.4h, #0x10\n" "shll v2.4s, v16.4h, #0x10\n" "45:" // Initial: Height 6: Width 3: init done "ldr q21, [%x[in_ptr], #0x0]\n" "ldr q16, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q20, [%x[in_ptr], #0x20]\n" "ldr q19, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q18, [%x[in_ptr], #0x40]\n" "ldr q17, [%x[in_ptr], #0x50]\n" "ldr q1, [%x[in_ptr], #0x60]\n" "ldr q26, [%x[in_ptr], #0x70]\n" "fadd v21.4s, v21.4s, v4.4s\n" "fadd v16.4s, v16.4s, v3.4s\n" "ldr q25, [%x[in_ptr], #0x80]\n" "ldr q24, [%x[in_ptr], #0x90]\n" "fadd v20.4s, v20.4s, v2.4s\n" "fadd v19.4s, v19.4s, v4.4s\n" "ldr q23, [%x[in_ptr], #0xa0]\n" "ldr q22, [%x[in_ptr], #0xb0]\n" "fadd v18.4s, v18.4s, v3.4s\n" "fadd v17.4s, v17.4s, v2.4s\n" "ldr q0, [%x[in_ptr], #0xc0]\n" "ldr q31, [%x[in_ptr], #0xd0]\n" "fadd v1.4s, v1.4s, v4.4s\n" "fadd v26.4s, v26.4s, v3.4s\n" "ldr q30, [%x[in_ptr], #0xe0]\n" "ldr q29, [%x[in_ptr], #0xf0]\n" "fadd v25.4s, v25.4s, v2.4s\n" "fadd v24.4s, v24.4s, v4.4s\n" "ldr q28, [%x[in_ptr], #0x100]\n" "ldr q27, [%x[in_ptr], #0x110]\n" "fadd v23.4s, v23.4s, v3.4s\n" "fadd v22.4s, v22.4s, v2.4s\n" "fadd v0.4s, v0.4s, v4.4s\n" "fadd v31.4s, v31.4s, v3.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v30.4s, v30.4s, v2.4s\n" "fadd v29.4s, v29.4s, v4.4s\n" "fadd v28.4s, v28.4s, v3.4s\n" "fadd v27.4s, v27.4s, v2.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v1.4s, v1.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v0.4s, v0.4s, v13.4s\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v1.4s, v1.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v0.4s, v0.4s, v12.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str d21, [x9, #0x0]\n" "str d16, [x9, #0x8]\n" ".inst 0x0ea16830 // bfcvtn v16.4h, v1.4s\n" ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n" "str d20, [x9, #0x10]\n" ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n" ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n" "add x9, x9, #0x18\n" "str d19, [x27, #0x0]\n" ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" "str d18, [x27, #0x8]\n" ".inst 0x0ea16815 // bfcvtn v21.4h, v0.4s\n" ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n" "str d17, [x27, #0x10]\n" ".inst 0x0ea16bd3 // bfcvtn v19.4h, v30.4s\n" ".inst 0x0ea16bb2 // bfcvtn v18.4h, v29.4s\n" "add x27, x27, #0x18\n" "str d16, [x26, #0x0]\n" ".inst 0x0ea16b91 // bfcvtn v17.4h, v28.4s\n" ".inst 0x0ea16b70 // bfcvtn v16.4h, v27.4s\n" "str d26, [x26, #0x8]\n" "str d25, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d24, [x25, #0x0]\n" "str d23, [x25, #0x8]\n" "str d22, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d21, [x24, #0x0]\n" "str d20, [x24, #0x8]\n" "str d19, [x24, #0x10]\n" "add x24, x24, #0x18\n" "str d18, [x23, #0x0]\n" "str d17, [x23, #0x8]\n" "str d16, [x23, #0x10]\n" "add x23, x23, #0x18\n" "bge 43b\n" "46:" // Initial: Height 6: no full blocks "cbz x10, 49f\n" "mov x20, %x[in_ptr]\n" "47:" // Initial: Height 6: Single loop "movi v22.16b, #0x0\n" "cbz %x[bias], 48f\n" "ldr h16, [x28, #0x0]\n" "shll v22.4s, v16.4h, #0x10\n" "48:" // Initial: Height 6: Scalar: no bias "ldr s16, [%x[in_ptr], #0x0]\n" "ldr s20, [%x[in_ptr], #0x30]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "ldr s19, [%x[in_ptr], #0x60]\n" "ldr s18, [%x[in_ptr], #0x90]\n" "ldr s17, [%x[in_ptr], #0xc0]\n" "ldr s21, [%x[in_ptr], #0xf0]\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v16.4s, v16.4s, v22.4s\n" "fadd v20.4s, v20.4s, v22.4s\n" "fadd v19.4s, v19.4s, v22.4s\n" "fadd v18.4s, v18.4s, v22.4s\n" "fadd v17.4s, v17.4s, v22.4s\n" "fadd v21.4s, v21.4s, v22.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n" "str h20, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h19, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h18, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h17, [x24, #0x0]\n" "add x24, x24, #0x2\n" "str h16, [x23, #0x0]\n" "add x23, x23, #0x2\n" "bne 47b\n" "add %x[in_ptr], x20, #0x180\n" "49:" // Initial: Height 6: no oddments "b 108f\n" "50:" // Initial: Height 7 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "mov x28, %x[bias]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x23, x24, %x[ldout], LSL #1\n" "add x22, x23, %x[ldout], LSL #1\n" "blt 54f\n" "51:" // Initial: Height 7: Block loop "cbnz %x[bias], 52f\n" "movi v7.16b, #0x0\n" "movi v6.16b, #0x0\n" "movi v5.16b, #0x0\n" "b 53f\n" "52:" // Initial: Height 7: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v7.4s, v18.4h, #0x10\n" "shll v6.4s, v17.4h, #0x10\n" "shll v5.4s, v16.4h, #0x10\n" "53:" // Initial: Height 7: Width 3: init done "ldr q18, [%x[in_ptr], #0x0]\n" "ldr q17, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q16, [%x[in_ptr], #0x20]\n" "ldr q21, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q20, [%x[in_ptr], #0x40]\n" "ldr q19, [%x[in_ptr], #0x50]\n" "ldr q4, [%x[in_ptr], #0x60]\n" "ldr q3, [%x[in_ptr], #0x70]\n" "fadd v18.4s, v18.4s, v7.4s\n" "fadd v17.4s, v17.4s, v6.4s\n" "ldr q2, [%x[in_ptr], #0x80]\n" "ldr q27, [%x[in_ptr], #0x90]\n" "fadd v16.4s, v16.4s, v5.4s\n" "fadd v21.4s, v21.4s, v7.4s\n" "ldr q26, [%x[in_ptr], #0xa0]\n" "ldr q25, [%x[in_ptr], #0xb0]\n" "fadd v20.4s, v20.4s, v6.4s\n" "fadd v19.4s, v19.4s, v5.4s\n" "ldr q24, [%x[in_ptr], #0xc0]\n" "ldr q23, [%x[in_ptr], #0xd0]\n" "fadd v4.4s, v4.4s, v7.4s\n" "fadd v3.4s, v3.4s, v6.4s\n" "ldr q22, [%x[in_ptr], #0xe0]\n" "ldr q1, [%x[in_ptr], #0xf0]\n" "fadd v2.4s, v2.4s, v5.4s\n" "fadd v27.4s, v27.4s, v7.4s\n" "ldr q0, [%x[in_ptr], #0x100]\n" "ldr q31, [%x[in_ptr], #0x110]\n" "fadd v26.4s, v26.4s, v6.4s\n" "fadd v25.4s, v25.4s, v5.4s\n" "ldr q30, [%x[in_ptr], #0x120]\n" "ldr q29, [%x[in_ptr], #0x130]\n" "fadd v24.4s, v24.4s, v7.4s\n" "fadd v23.4s, v23.4s, v6.4s\n" "ldr q28, [%x[in_ptr], #0x140]\n" "fadd v22.4s, v22.4s, v5.4s\n" "fadd v1.4s, v1.4s, v7.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v0.4s, v0.4s, v6.4s\n" "fadd v31.4s, v31.4s, v5.4s\n" "fadd v30.4s, v30.4s, v7.4s\n" "fadd v29.4s, v29.4s, v6.4s\n" "fadd v28.4s, v28.4s, v5.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v4.4s, v4.4s, v13.4s\n" "fmin v3.4s, v3.4s, v13.4s\n" "fmin v2.4s, v2.4s, v13.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v1.4s, v1.4s, v13.4s\n" "fmin v0.4s, v0.4s, v13.4s\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v4.4s, v4.4s, v12.4s\n" "fmax v3.4s, v3.4s, v12.4s\n" "fmax v2.4s, v2.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v1.4s, v1.4s, v12.4s\n" "fmax v0.4s, v0.4s, v12.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" "str d18, [x9, #0x0]\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16892 // bfcvtn v18.4h, v4.4s\n" "str d17, [x9, #0x8]\n" "str d16, [x9, #0x10]\n" ".inst 0x0ea16871 // bfcvtn v17.4h, v3.4s\n" ".inst 0x0ea16850 // bfcvtn v16.4h, v2.4s\n" "add x9, x9, #0x18\n" "str d21, [x27, #0x0]\n" ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n" ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n" "str d20, [x27, #0x8]\n" ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n" ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n" "str d19, [x27, #0x10]\n" ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" "add x27, x27, #0x18\n" "str d18, [x26, #0x0]\n" ".inst 0x0ea16835 // bfcvtn v21.4h, v1.4s\n" ".inst 0x0ea16814 // bfcvtn v20.4h, v0.4s\n" "str d17, [x26, #0x8]\n" ".inst 0x0ea16bf3 // bfcvtn v19.4h, v31.4s\n" ".inst 0x0ea16bd2 // bfcvtn v18.4h, v30.4s\n" "str d16, [x26, #0x10]\n" ".inst 0x0ea16bb1 // bfcvtn v17.4h, v29.4s\n" ".inst 0x0ea16b90 // bfcvtn v16.4h, v28.4s\n" "add x26, x26, #0x18\n" "str d27, [x25, #0x0]\n" "str d26, [x25, #0x8]\n" "str d25, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d24, [x24, #0x0]\n" "str d23, [x24, #0x8]\n" "str d22, [x24, #0x10]\n" "add x24, x24, #0x18\n" "str d21, [x23, #0x0]\n" "str d20, [x23, #0x8]\n" "str d19, [x23, #0x10]\n" "add x23, x23, #0x18\n" "str d18, [x22, #0x0]\n" "str d17, [x22, #0x8]\n" "str d16, [x22, #0x10]\n" "add x22, x22, #0x18\n" "bge 51b\n" "54:" // Initial: Height 7: no full blocks "cbz x10, 57f\n" "mov x20, %x[in_ptr]\n" "55:" // Initial: Height 7: Single loop "movi v23.16b, #0x0\n" "cbz %x[bias], 56f\n" "ldr h16, [x28, #0x0]\n" "shll v23.4s, v16.4h, #0x10\n" "56:" // Initial: Height 7: Scalar: no bias "ldr s16, [%x[in_ptr], #0x0]\n" "ldr s21, [%x[in_ptr], #0x30]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "ldr s20, [%x[in_ptr], #0x60]\n" "ldr s19, [%x[in_ptr], #0x90]\n" "ldr s18, [%x[in_ptr], #0xc0]\n" "ldr s17, [%x[in_ptr], #0xf0]\n" "ldr s22, [%x[in_ptr], #0x120]\n" "fadd v16.4s, v16.4s, v23.4s\n" "fadd v21.4s, v21.4s, v23.4s\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v20.4s, v20.4s, v23.4s\n" "fadd v19.4s, v19.4s, v23.4s\n" "fadd v18.4s, v18.4s, v23.4s\n" "fadd v17.4s, v17.4s, v23.4s\n" "fadd v22.4s, v22.4s, v23.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "str h21, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h20, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h19, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h18, [x24, #0x0]\n" "add x24, x24, #0x2\n" "str h17, [x23, #0x0]\n" "add x23, x23, #0x2\n" "str h16, [x22, #0x0]\n" "add x22, x22, #0x2\n" "bne 55b\n" "add %x[in_ptr], x20, #0x180\n" "57:" // Initial: Height 7: no oddments "b 108f\n" "58:" // Initial: Height 8 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "mov x28, %x[bias]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x23, x24, %x[ldout], LSL #1\n" "add x22, x23, %x[ldout], LSL #1\n" "add x21, x22, %x[ldout], LSL #1\n" "blt 62f\n" "59:" // Initial: Height 8: Block loop "cbnz %x[bias], 60f\n" "movi v10.16b, #0x0\n" "movi v9.16b, #0x0\n" "movi v8.16b, #0x0\n" "b 61f\n" "60:" // Initial: Height 8: Width 3: bias "ldr d18, [x28, #0x0]\n" "ldr d17, [x28, #0x8]\n" "ldr d16, [x28, #0x10]\n" "shll v10.4s, v18.4h, #0x10\n" "shll v9.4s, v17.4h, #0x10\n" "shll v8.4s, v16.4h, #0x10\n" "61:" // Initial: Height 8: Width 3: init done "ldr q18, [%x[in_ptr], #0x0]\n" "ldr q17, [%x[in_ptr], #0x10]\n" "sub x10, x10, #0xc\n" "add x28, x28, #0x18\n" "ldr q16, [%x[in_ptr], #0x20]\n" "ldr q22, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q21, [%x[in_ptr], #0x40]\n" "ldr q20, [%x[in_ptr], #0x50]\n" "ldr q19, [%x[in_ptr], #0x60]\n" "ldr q7, [%x[in_ptr], #0x70]\n" "fadd v18.4s, v18.4s, v10.4s\n" "fadd v17.4s, v17.4s, v9.4s\n" "ldr q6, [%x[in_ptr], #0x80]\n" "ldr q5, [%x[in_ptr], #0x90]\n" "fadd v16.4s, v16.4s, v8.4s\n" "fadd v22.4s, v22.4s, v10.4s\n" "ldr q29, [%x[in_ptr], #0xa0]\n" "ldr q28, [%x[in_ptr], #0xb0]\n" "fadd v21.4s, v21.4s, v9.4s\n" "fadd v20.4s, v20.4s, v8.4s\n" "ldr q27, [%x[in_ptr], #0xc0]\n" "ldr q26, [%x[in_ptr], #0xd0]\n" "fadd v19.4s, v19.4s, v10.4s\n" "fadd v7.4s, v7.4s, v9.4s\n" "ldr q25, [%x[in_ptr], #0xe0]\n" "ldr q24, [%x[in_ptr], #0xf0]\n" "fadd v6.4s, v6.4s, v8.4s\n" "fadd v5.4s, v5.4s, v10.4s\n" "ldr q23, [%x[in_ptr], #0x100]\n" "ldr q4, [%x[in_ptr], #0x110]\n" "fadd v29.4s, v29.4s, v9.4s\n" "fadd v28.4s, v28.4s, v8.4s\n" "ldr q3, [%x[in_ptr], #0x120]\n" "ldr q2, [%x[in_ptr], #0x130]\n" "fadd v27.4s, v27.4s, v10.4s\n" "fadd v26.4s, v26.4s, v9.4s\n" "ldr q1, [%x[in_ptr], #0x140]\n" "ldr q0, [%x[in_ptr], #0x150]\n" "fadd v25.4s, v25.4s, v8.4s\n" "fadd v24.4s, v24.4s, v10.4s\n" "ldr q31, [%x[in_ptr], #0x160]\n" "ldr q30, [%x[in_ptr], #0x170]\n" "fadd v23.4s, v23.4s, v9.4s\n" "fadd v4.4s, v4.4s, v8.4s\n" "fadd v3.4s, v3.4s, v10.4s\n" "fadd v2.4s, v2.4s, v9.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v1.4s, v1.4s, v8.4s\n" "fadd v0.4s, v0.4s, v10.4s\n" "fadd v31.4s, v31.4s, v9.4s\n" "fadd v30.4s, v30.4s, v8.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v7.4s, v7.4s, v13.4s\n" "fmin v6.4s, v6.4s, v13.4s\n" "fmin v5.4s, v5.4s, v13.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v4.4s, v4.4s, v13.4s\n" "fmin v3.4s, v3.4s, v13.4s\n" "fmin v2.4s, v2.4s, v13.4s\n" "fmin v1.4s, v1.4s, v13.4s\n" "fmin v0.4s, v0.4s, v13.4s\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v7.4s, v7.4s, v12.4s\n" "fmax v6.4s, v6.4s, v12.4s\n" "fmax v5.4s, v5.4s, v12.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v4.4s, v4.4s, v12.4s\n" "fmax v3.4s, v3.4s, v12.4s\n" "fmax v2.4s, v2.4s, v12.4s\n" "fmax v1.4s, v1.4s, v12.4s\n" "fmax v0.4s, v0.4s, v12.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" "str d18, [x9, #0x0]\n" "str d17, [x9, #0x8]\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea168f2 // bfcvtn v18.4h, v7.4s\n" "str d16, [x9, #0x10]\n" ".inst 0x0ea168d1 // bfcvtn v17.4h, v6.4s\n" ".inst 0x0ea168b0 // bfcvtn v16.4h, v5.4s\n" "add x9, x9, #0x18\n" "str d22, [x27, #0x0]\n" ".inst 0x0ea16bbd // bfcvtn v29.4h, v29.4s\n" ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n" "str d21, [x27, #0x8]\n" ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n" ".inst 0x0ea16b5a // bfcvtn v26.4h, v26.4s\n" "str d20, [x27, #0x10]\n" ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n" ".inst 0x0ea16b18 // bfcvtn v24.4h, v24.4s\n" "add x27, x27, #0x18\n" "str d19, [x26, #0x0]\n" ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n" ".inst 0x0ea16896 // bfcvtn v22.4h, v4.4s\n" "str d18, [x26, #0x8]\n" ".inst 0x0ea16875 // bfcvtn v21.4h, v3.4s\n" ".inst 0x0ea16854 // bfcvtn v20.4h, v2.4s\n" "str d17, [x26, #0x10]\n" ".inst 0x0ea16833 // bfcvtn v19.4h, v1.4s\n" ".inst 0x0ea16812 // bfcvtn v18.4h, v0.4s\n" "add x26, x26, #0x18\n" "str d16, [x25, #0x0]\n" ".inst 0x0ea16bf1 // bfcvtn v17.4h, v31.4s\n" ".inst 0x0ea16bd0 // bfcvtn v16.4h, v30.4s\n" "str d29, [x25, #0x8]\n" "str d28, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d27, [x24, #0x0]\n" "str d26, [x24, #0x8]\n" "str d25, [x24, #0x10]\n" "add x24, x24, #0x18\n" "str d24, [x23, #0x0]\n" "str d23, [x23, #0x8]\n" "str d22, [x23, #0x10]\n" "add x23, x23, #0x18\n" "str d21, [x22, #0x0]\n" "str d20, [x22, #0x8]\n" "str d19, [x22, #0x10]\n" "add x22, x22, #0x18\n" "str d18, [x21, #0x0]\n" "str d17, [x21, #0x8]\n" "str d16, [x21, #0x10]\n" "add x21, x21, #0x18\n" "bge 59b\n" "62:" // Initial: Height 8: no full blocks "cbz x10, 65f\n" "mov x20, %x[in_ptr]\n" "63:" // Initial: Height 8: Single loop "movi v24.16b, #0x0\n" "cbz %x[bias], 64f\n" "ldr h16, [x28, #0x0]\n" "shll v24.4s, v16.4h, #0x10\n" "64:" // Initial: Height 8: Scalar: no bias "ldr s17, [%x[in_ptr], #0x0]\n" "ldr s16, [%x[in_ptr], #0x30]\n" "subs x10, x10, #0x1\n" "add x28, x28, #0x2\n" "ldr s21, [%x[in_ptr], #0x60]\n" "ldr s20, [%x[in_ptr], #0x90]\n" "ldr s19, [%x[in_ptr], #0xc0]\n" "ldr s18, [%x[in_ptr], #0xf0]\n" "ldr s23, [%x[in_ptr], #0x120]\n" "ldr s22, [%x[in_ptr], #0x150]\n" "fadd v17.4s, v17.4s, v24.4s\n" "fadd v16.4s, v16.4s, v24.4s\n" "fadd v21.4s, v21.4s, v24.4s\n" "fadd v20.4s, v20.4s, v24.4s\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v19.4s, v19.4s, v24.4s\n" "fadd v18.4s, v18.4s, v24.4s\n" "fadd v23.4s, v23.4s, v24.4s\n" "fadd v22.4s, v22.4s, v24.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v16.4s, v16.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v16.4s, v16.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n" ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" "str h17, [x9, #0x0]\n" "add x9, x9, #0x2\n" "str h16, [x27, #0x0]\n" ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "add x27, x27, #0x2\n" "str h21, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h20, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h19, [x24, #0x0]\n" "add x24, x24, #0x2\n" "str h18, [x23, #0x0]\n" "add x23, x23, #0x2\n" "str h17, [x22, #0x0]\n" "add x22, x22, #0x2\n" "str h16, [x21, #0x0]\n" "add x21, x21, #0x2\n" "bne 63b\n" "add %x[in_ptr], x20, #0x180\n" "65:" // Initial: Height 8: no oddments "subs %x[rows], %x[rows], #0x8\n" "add %x[out_ptr], %x[out_ptr], x11\n" "bgt 1b\n" "b 108f\n" "66:" // Accumulate "67:" // Accumulate: Row loop "cmp %x[rows], #0x7\n" "bgt 103f\n" "beq 98f\n" "cmp %x[rows], #0x5\n" "bgt 93f\n" "beq 88f\n" "cmp %x[rows], #0x3\n" "bgt 83f\n" "beq 78f\n" "cmp %x[rows], #0x1\n" "bgt 73f\n" "68:" // Accumulate: Height 1 "mov x10, %x[cols]\n" "mov x9, %x[out_ptr]\n" "cmp x10, #0xc\n" "blt 70f\n" "69:" // Accumulate: Height 1: Block loop "ldr d16, [x9, #0x0]\n" "ldr q19, [%x[in_ptr], #0x0]\n" "sub x10, x10, #0xc\n" "ldr q18, [%x[in_ptr], #0x10]\n" "ldr q17, [%x[in_ptr], #0x20]\n" "cmp x10, #0xc\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v19.4s, v19.4s, v16.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n" "str d16, [x9, #0x0]\n" "ldr d16, [x9, #0x8]\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v18.4s, v18.4s, v16.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n" "str d16, [x9, #0x8]\n" "ldr d16, [x9, #0x10]\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v17.4s, v17.4s, v16.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" ".inst 0x0ea16a30 // bfcvtn v16.4h, v17.4s\n" "str d16, [x9, #0x10]\n" "add x9, x9, #0x18\n" "bge 69b\n" "70:" // Accumulate: Height 1: no full blocks "cbz x10, 72f\n" "mov x20, %x[in_ptr]\n" "71:" // Accumulate: Height 1: Single loop "ldr h16, [x9, #0x0]\n" "ldr s17, [%x[in_ptr], #0x0]\n" "subs x10, x10, #0x1\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v17.4s, v17.4s, v16.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" ".inst 0x0ea16a30 // bfcvtn v16.4h, v17.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" "bne 71b\n" "add %x[in_ptr], x20, #0x180\n" "72:" // Accumulate: Height 1: no oddments "b 108f\n" "73:" // Accumulate: Height 2 "mov x10, %x[cols]\n" "mov x9, %x[out_ptr]\n" "cmp x10, #0xc\n" "add x27, x9, %x[ldout], LSL #1\n" "blt 75f\n" "74:" // Accumulate: Height 2: Block loop "ldr d17, [x9, #0x0]\n" "ldr d16, [x27, #0x0]\n" "sub x10, x10, #0xc\n" "ldr q23, [%x[in_ptr], #0x0]\n" "ldr q22, [%x[in_ptr], #0x30]\n" "cmp x10, #0xc\n" "ldr q21, [%x[in_ptr], #0x10]\n" "ldr q20, [%x[in_ptr], #0x40]\n" "ldr q19, [%x[in_ptr], #0x20]\n" "ldr q18, [%x[in_ptr], #0x50]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v23.4s, v23.4s, v17.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16af0 // bfcvtn v16.4h, v23.4s\n" ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n" "str d16, [x9, #0x0]\n" "ldr d16, [x9, #0x8]\n" "str d17, [x27, #0x0]\n" "shll v17.4s, v16.4h, #0x10\n" "ldr d16, [x27, #0x8]\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v21.4s, v21.4s, v17.4s\n" "fadd v20.4s, v20.4s, v16.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n" ".inst 0x0ea16a91 // bfcvtn v17.4h, v20.4s\n" "str d16, [x9, #0x8]\n" "ldr d16, [x9, #0x10]\n" "str d17, [x27, #0x8]\n" "shll v17.4s, v16.4h, #0x10\n" "ldr d16, [x27, #0x10]\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v19.4s, v19.4s, v17.4s\n" "fadd v18.4s, v18.4s, v16.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" ".inst 0x0ea16a71 // bfcvtn v17.4h, v19.4s\n" ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n" "str d17, [x9, #0x10]\n" "add x9, x9, #0x18\n" "str d16, [x27, #0x10]\n" "add x27, x27, #0x18\n" "bge 74b\n" "75:" // Accumulate: Height 2: no full blocks "cbz x10, 77f\n" "mov x20, %x[in_ptr]\n" "76:" // Accumulate: Height 2: Single loop "ldr h17, [x9, #0x0]\n" "ldr h16, [x27, #0x0]\n" "subs x10, x10, #0x1\n" "ldr s19, [%x[in_ptr], #0x0]\n" "ldr s18, [%x[in_ptr], #0x30]\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v19.4s, v19.4s, v17.4s\n" "fadd v18.4s, v18.4s, v16.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" ".inst 0x0ea16a50 // bfcvtn v16.4h, v18.4s\n" "str h16, [x27, #0x0]\n" "add x27, x27, #0x2\n" "bne 76b\n" "add %x[in_ptr], x20, #0x180\n" "77:" // Accumulate: Height 2: no oddments "b 108f\n" "78:" // Accumulate: Height 3 "mov x10, %x[cols]\n" "mov x9, %x[out_ptr]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "blt 80f\n" "79:" // Accumulate: Height 3: Block loop "ldr d18, [x9, #0x0]\n" "ldr d17, [x27, #0x0]\n" "sub x10, x10, #0xc\n" "ldr d16, [x26, #0x0]\n" "ldr q27, [%x[in_ptr], #0x0]\n" "cmp x10, #0xc\n" "ldr q26, [%x[in_ptr], #0x30]\n" "ldr q25, [%x[in_ptr], #0x60]\n" "ldr q24, [%x[in_ptr], #0x10]\n" "ldr q23, [%x[in_ptr], #0x40]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "ldr q22, [%x[in_ptr], #0x70]\n" "ldr q21, [%x[in_ptr], #0x20]\n" "shll v16.4s, v16.4h, #0x10\n" "ldr q20, [%x[in_ptr], #0x50]\n" "ldr q19, [%x[in_ptr], #0x80]\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fadd v27.4s, v27.4s, v18.4s\n" "fadd v26.4s, v26.4s, v17.4s\n" "fadd v25.4s, v25.4s, v16.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" ".inst 0x0ea16b72 // bfcvtn v18.4h, v27.4s\n" ".inst 0x0ea16b50 // bfcvtn v16.4h, v26.4s\n" ".inst 0x0ea16b31 // bfcvtn v17.4h, v25.4s\n" "str d18, [x9, #0x0]\n" "str d16, [x27, #0x0]\n" "ldr d16, [x9, #0x8]\n" "str d17, [x26, #0x0]\n" "ldr d17, [x27, #0x8]\n" "shll v18.4s, v16.4h, #0x10\n" "ldr d16, [x26, #0x8]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v24.4s, v24.4s, v18.4s\n" "fadd v23.4s, v23.4s, v17.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16b10 // bfcvtn v16.4h, v24.4s\n" ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n" "str d16, [x9, #0x8]\n" ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n" "ldr d16, [x9, #0x10]\n" "str d18, [x27, #0x8]\n" "str d17, [x26, #0x8]\n" "shll v18.4s, v16.4h, #0x10\n" "ldr d17, [x27, #0x10]\n" "ldr d16, [x26, #0x10]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v21.4s, v21.4s, v18.4s\n" "fadd v20.4s, v20.4s, v17.4s\n" "fadd v19.4s, v19.4s, v16.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n" ".inst 0x0ea16a91 // bfcvtn v17.4h, v20.4s\n" "str d16, [x9, #0x10]\n" "add x9, x9, #0x18\n" ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n" "str d17, [x27, #0x10]\n" "add x27, x27, #0x18\n" "str d16, [x26, #0x10]\n" "add x26, x26, #0x18\n" "bge 79b\n" "80:" // Accumulate: Height 3: no full blocks "cbz x10, 82f\n" "mov x20, %x[in_ptr]\n" "81:" // Accumulate: Height 3: Single loop "ldr h18, [x9, #0x0]\n" "ldr h17, [x27, #0x0]\n" "subs x10, x10, #0x1\n" "ldr h16, [x26, #0x0]\n" "ldr s21, [%x[in_ptr], #0x0]\n" "ldr s20, [%x[in_ptr], #0x30]\n" "ldr s19, [%x[in_ptr], #0x60]\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v21.4s, v21.4s, v18.4s\n" "fadd v20.4s, v20.4s, v17.4s\n" "fadd v19.4s, v19.4s, v16.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n" ".inst 0x0ea16a91 // bfcvtn v17.4h, v20.4s\n" "str h16, [x9, #0x0]\n" "add x9, x9, #0x2\n" ".inst 0x0ea16a70 // bfcvtn v16.4h, v19.4s\n" "str h17, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h16, [x26, #0x0]\n" "add x26, x26, #0x2\n" "bne 81b\n" "add %x[in_ptr], x20, #0x180\n" "82:" // Accumulate: Height 3: no oddments "b 108f\n" "83:" // Accumulate: Height 4 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "blt 85f\n" "84:" // Accumulate: Height 4: Block loop "ldr d19, [x9, #0x0]\n" "ldr d18, [x27, #0x0]\n" "sub x10, x10, #0xc\n" "ldr d17, [x26, #0x0]\n" "ldr d16, [x25, #0x0]\n" "cmp x10, #0xc\n" "ldr q31, [%x[in_ptr], #0x0]\n" "ldr q30, [%x[in_ptr], #0x30]\n" "ldr q29, [%x[in_ptr], #0x60]\n" "ldr q28, [%x[in_ptr], #0x90]\n" "shll v19.4s, v19.4h, #0x10\n" "shll v18.4s, v18.4h, #0x10\n" "ldr q27, [%x[in_ptr], #0x10]\n" "ldr q26, [%x[in_ptr], #0x40]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "ldr q25, [%x[in_ptr], #0x70]\n" "ldr q24, [%x[in_ptr], #0xa0]\n" "ldr q23, [%x[in_ptr], #0x20]\n" "ldr q22, [%x[in_ptr], #0x50]\n" "fadd v31.4s, v31.4s, v19.4s\n" "fadd v30.4s, v30.4s, v18.4s\n" "ldr q21, [%x[in_ptr], #0x80]\n" "ldr q20, [%x[in_ptr], #0xb0]\n" "fadd v29.4s, v29.4s, v17.4s\n" "fadd v28.4s, v28.4s, v16.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" ".inst 0x0ea16bf3 // bfcvtn v19.4h, v31.4s\n" ".inst 0x0ea16bd0 // bfcvtn v16.4h, v30.4s\n" ".inst 0x0ea16bb2 // bfcvtn v18.4h, v29.4s\n" ".inst 0x0ea16b91 // bfcvtn v17.4h, v28.4s\n" "str d19, [x9, #0x0]\n" "str d16, [x27, #0x0]\n" "ldr d16, [x9, #0x8]\n" "str d18, [x26, #0x0]\n" "str d17, [x25, #0x0]\n" "ldr d18, [x27, #0x8]\n" "shll v19.4s, v16.4h, #0x10\n" "ldr d17, [x26, #0x8]\n" "ldr d16, [x25, #0x8]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "fadd v27.4s, v27.4s, v19.4s\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v26.4s, v26.4s, v18.4s\n" "fadd v25.4s, v25.4s, v17.4s\n" "fadd v24.4s, v24.4s, v16.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" ".inst 0x0ea16b71 // bfcvtn v17.4h, v27.4s\n" ".inst 0x0ea16b53 // bfcvtn v19.4h, v26.4s\n" ".inst 0x0ea16b30 // bfcvtn v16.4h, v25.4s\n" "str d17, [x9, #0x8]\n" ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n" "ldr d17, [x9, #0x10]\n" "str d19, [x27, #0x8]\n" "str d16, [x26, #0x8]\n" "ldr d16, [x27, #0x10]\n" "str d18, [x25, #0x8]\n" "shll v19.4s, v17.4h, #0x10\n" "ldr d17, [x26, #0x10]\n" "shll v18.4s, v16.4h, #0x10\n" "ldr d16, [x25, #0x10]\n" "shll v17.4s, v17.4h, #0x10\n" "fadd v23.4s, v23.4s, v19.4s\n" "fadd v22.4s, v22.4s, v18.4s\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v21.4s, v21.4s, v17.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fadd v20.4s, v20.4s, v16.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "str d17, [x9, #0x10]\n" "add x9, x9, #0x18\n" ".inst 0x0ea16ab1 // bfcvtn v17.4h, v21.4s\n" "str d16, [x27, #0x10]\n" "add x27, x27, #0x18\n" ".inst 0x0ea16a90 // bfcvtn v16.4h, v20.4s\n" "str d17, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d16, [x25, #0x10]\n" "add x25, x25, #0x18\n" "bge 84b\n" "85:" // Accumulate: Height 4: no full blocks "cbz x10, 87f\n" "mov x20, %x[in_ptr]\n" "86:" // Accumulate: Height 4: Single loop "ldr h19, [x9, #0x0]\n" "ldr h18, [x27, #0x0]\n" "subs x10, x10, #0x1\n" "ldr h17, [x26, #0x0]\n" "ldr h16, [x25, #0x0]\n" "ldr s23, [%x[in_ptr], #0x0]\n" "ldr s22, [%x[in_ptr], #0x30]\n" "ldr s21, [%x[in_ptr], #0x60]\n" "ldr s20, [%x[in_ptr], #0x90]\n" "shll v19.4s, v19.4h, #0x10\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v23.4s, v23.4s, v19.4s\n" "fadd v22.4s, v22.4s, v18.4s\n" "fadd v21.4s, v21.4s, v17.4s\n" "fadd v20.4s, v20.4s, v16.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" ".inst 0x0ea16af3 // bfcvtn v19.4h, v23.4s\n" ".inst 0x0ea16ad2 // bfcvtn v18.4h, v22.4s\n" ".inst 0x0ea16ab1 // bfcvtn v17.4h, v21.4s\n" ".inst 0x0ea16a90 // bfcvtn v16.4h, v20.4s\n" "str h19, [x9, #0x0]\n" "add x9, x9, #0x2\n" "str h18, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h17, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h16, [x25, #0x0]\n" "add x25, x25, #0x2\n" "bne 86b\n" "add %x[in_ptr], x20, #0x180\n" "87:" // Accumulate: Height 4: no oddments "b 108f\n" "88:" // Accumulate: Height 5 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x24, x25, %x[ldout], LSL #1\n" "blt 90f\n" "89:" // Accumulate: Height 5: Block loop "ldr d20, [x9, #0x0]\n" "ldr d19, [x27, #0x0]\n" "sub x10, x10, #0xc\n" "ldr d18, [x26, #0x0]\n" "ldr d17, [x25, #0x0]\n" "cmp x10, #0xc\n" "ldr d16, [x24, #0x0]\n" "ldr q3, [%x[in_ptr], #0x0]\n" "ldr q2, [%x[in_ptr], #0x30]\n" "ldr q1, [%x[in_ptr], #0x60]\n" "shll v20.4s, v20.4h, #0x10\n" "shll v19.4s, v19.4h, #0x10\n" "ldr q0, [%x[in_ptr], #0x90]\n" "ldr q31, [%x[in_ptr], #0xc0]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "ldr q30, [%x[in_ptr], #0x10]\n" "ldr q29, [%x[in_ptr], #0x40]\n" "shll v16.4s, v16.4h, #0x10\n" "ldr q28, [%x[in_ptr], #0x70]\n" "ldr q27, [%x[in_ptr], #0xa0]\n" "fadd v3.4s, v3.4s, v20.4s\n" "fadd v2.4s, v2.4s, v19.4s\n" "ldr q26, [%x[in_ptr], #0xd0]\n" "ldr q25, [%x[in_ptr], #0x20]\n" "fadd v1.4s, v1.4s, v18.4s\n" "fadd v0.4s, v0.4s, v17.4s\n" "ldr q24, [%x[in_ptr], #0x50]\n" "ldr q23, [%x[in_ptr], #0x80]\n" "fadd v31.4s, v31.4s, v16.4s\n" "ldr q22, [%x[in_ptr], #0xb0]\n" "ldr q21, [%x[in_ptr], #0xe0]\n" "fmin v3.4s, v3.4s, v13.4s\n" "fmin v2.4s, v2.4s, v13.4s\n" "fmin v1.4s, v1.4s, v13.4s\n" "fmin v0.4s, v0.4s, v13.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmax v3.4s, v3.4s, v12.4s\n" "fmax v2.4s, v2.4s, v12.4s\n" "fmax v1.4s, v1.4s, v12.4s\n" "fmax v0.4s, v0.4s, v12.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" ".inst 0x0ea16874 // bfcvtn v20.4h, v3.4s\n" ".inst 0x0ea16853 // bfcvtn v19.4h, v2.4s\n" ".inst 0x0ea16831 // bfcvtn v17.4h, v1.4s\n" ".inst 0x0ea16810 // bfcvtn v16.4h, v0.4s\n" ".inst 0x0ea16bf2 // bfcvtn v18.4h, v31.4s\n" "str d20, [x9, #0x0]\n" "str d19, [x27, #0x0]\n" "str d17, [x26, #0x0]\n" "ldr d17, [x9, #0x8]\n" "str d16, [x25, #0x0]\n" "ldr d16, [x27, #0x8]\n" "str d18, [x24, #0x0]\n" "ldr d18, [x26, #0x8]\n" "shll v20.4s, v17.4h, #0x10\n" "ldr d17, [x25, #0x8]\n" "shll v19.4s, v16.4h, #0x10\n" "ldr d16, [x24, #0x8]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "fadd v30.4s, v30.4s, v20.4s\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v29.4s, v29.4s, v19.4s\n" "fadd v28.4s, v28.4s, v18.4s\n" "fadd v27.4s, v27.4s, v17.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fadd v26.4s, v26.4s, v16.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" ".inst 0x0ea16bd2 // bfcvtn v18.4h, v30.4s\n" ".inst 0x0ea16bb3 // bfcvtn v19.4h, v29.4s\n" ".inst 0x0ea16b91 // bfcvtn v17.4h, v28.4s\n" ".inst 0x0ea16b70 // bfcvtn v16.4h, v27.4s\n" "str d18, [x9, #0x8]\n" ".inst 0x0ea16b52 // bfcvtn v18.4h, v26.4s\n" "str d19, [x27, #0x8]\n" "str d17, [x26, #0x8]\n" "ldr d17, [x9, #0x10]\n" "str d16, [x25, #0x8]\n" "ldr d16, [x27, #0x10]\n" "str d18, [x24, #0x8]\n" "ldr d18, [x26, #0x10]\n" "shll v20.4s, v17.4h, #0x10\n" "ldr d17, [x25, #0x10]\n" "shll v19.4s, v16.4h, #0x10\n" "ldr d16, [x24, #0x10]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "fadd v25.4s, v25.4s, v20.4s\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v24.4s, v24.4s, v19.4s\n" "fadd v23.4s, v23.4s, v18.4s\n" "fadd v22.4s, v22.4s, v17.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fadd v21.4s, v21.4s, v16.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" ".inst 0x0ea16b30 // bfcvtn v16.4h, v25.4s\n" ".inst 0x0ea16b13 // bfcvtn v19.4h, v24.4s\n" ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n" ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n" "str d16, [x9, #0x10]\n" "add x9, x9, #0x18\n" ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n" "str d19, [x27, #0x10]\n" "add x27, x27, #0x18\n" "str d18, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d17, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d16, [x24, #0x10]\n" "add x24, x24, #0x18\n" "bge 89b\n" "90:" // Accumulate: Height 5: no full blocks "cbz x10, 92f\n" "mov x20, %x[in_ptr]\n" "91:" // Accumulate: Height 5: Single loop "ldr h20, [x9, #0x0]\n" "ldr h19, [x27, #0x0]\n" "subs x10, x10, #0x1\n" "ldr h18, [x26, #0x0]\n" "ldr h17, [x25, #0x0]\n" "ldr h16, [x24, #0x0]\n" "ldr s25, [%x[in_ptr], #0x0]\n" "ldr s24, [%x[in_ptr], #0x30]\n" "ldr s23, [%x[in_ptr], #0x60]\n" "shll v20.4s, v20.4h, #0x10\n" "shll v19.4s, v19.4h, #0x10\n" "ldr s22, [%x[in_ptr], #0x90]\n" "ldr s21, [%x[in_ptr], #0xc0]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v25.4s, v25.4s, v20.4s\n" "fadd v24.4s, v24.4s, v19.4s\n" "fadd v23.4s, v23.4s, v18.4s\n" "fadd v22.4s, v22.4s, v17.4s\n" "fadd v21.4s, v21.4s, v16.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmin v21.4s, v21.4s, v13.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" "fmax v21.4s, v21.4s, v12.4s\n" ".inst 0x0ea16b34 // bfcvtn v20.4h, v25.4s\n" ".inst 0x0ea16b13 // bfcvtn v19.4h, v24.4s\n" ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n" ".inst 0x0ea16ad1 // bfcvtn v17.4h, v22.4s\n" ".inst 0x0ea16ab0 // bfcvtn v16.4h, v21.4s\n" "str h20, [x9, #0x0]\n" "add x9, x9, #0x2\n" "str h19, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h18, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h17, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h16, [x24, #0x0]\n" "add x24, x24, #0x2\n" "bne 91b\n" "add %x[in_ptr], x20, #0x180\n" "92:" // Accumulate: Height 5: no oddments "b 108f\n" "93:" // Accumulate: Height 6 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x23, x24, %x[ldout], LSL #1\n" "blt 95f\n" "94:" // Accumulate: Height 6: Block loop "ldr d21, [x9, #0x0]\n" "ldr d20, [x27, #0x0]\n" "sub x10, x10, #0xc\n" "ldr d19, [x26, #0x0]\n" "ldr d18, [x25, #0x0]\n" "cmp x10, #0xc\n" "ldr d17, [x24, #0x0]\n" "ldr d16, [x23, #0x0]\n" "ldr q6, [%x[in_ptr], #0x0]\n" "ldr q5, [%x[in_ptr], #0x30]\n" "shll v22.4s, v21.4h, #0x10\n" "shll v21.4s, v20.4h, #0x10\n" "ldr q4, [%x[in_ptr], #0x60]\n" "ldr q3, [%x[in_ptr], #0x90]\n" "shll v20.4s, v19.4h, #0x10\n" "shll v18.4s, v18.4h, #0x10\n" "ldr q2, [%x[in_ptr], #0xc0]\n" "ldr q19, [%x[in_ptr], #0xf0]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "ldr q1, [%x[in_ptr], #0x10]\n" "ldr q0, [%x[in_ptr], #0x40]\n" "fadd v6.4s, v6.4s, v22.4s\n" "fadd v5.4s, v5.4s, v21.4s\n" "ldr q31, [%x[in_ptr], #0x70]\n" "ldr q30, [%x[in_ptr], #0xa0]\n" "fadd v4.4s, v4.4s, v20.4s\n" "fadd v3.4s, v3.4s, v18.4s\n" "ldr q29, [%x[in_ptr], #0xd0]\n" "ldr q28, [%x[in_ptr], #0x100]\n" "fadd v2.4s, v2.4s, v17.4s\n" "fadd v19.4s, v19.4s, v16.4s\n" "ldr q27, [%x[in_ptr], #0x20]\n" "ldr q26, [%x[in_ptr], #0x50]\n" "fmin v6.4s, v6.4s, v13.4s\n" "fmin v5.4s, v5.4s, v13.4s\n" "ldr q25, [%x[in_ptr], #0x80]\n" "ldr q24, [%x[in_ptr], #0xb0]\n" "fmin v4.4s, v4.4s, v13.4s\n" "fmin v3.4s, v3.4s, v13.4s\n" "ldr q23, [%x[in_ptr], #0xe0]\n" "ldr q22, [%x[in_ptr], #0x110]\n" "fmin v2.4s, v2.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmax v6.4s, v6.4s, v12.4s\n" "fmax v5.4s, v5.4s, v12.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fmax v4.4s, v4.4s, v12.4s\n" "fmax v3.4s, v3.4s, v12.4s\n" "fmax v2.4s, v2.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" ".inst 0x0ea168d5 // bfcvtn v21.4h, v6.4s\n" ".inst 0x0ea168b4 // bfcvtn v20.4h, v5.4s\n" ".inst 0x0ea16892 // bfcvtn v18.4h, v4.4s\n" ".inst 0x0ea16871 // bfcvtn v17.4h, v3.4s\n" ".inst 0x0ea16850 // bfcvtn v16.4h, v2.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" "str d21, [x9, #0x0]\n" "str d20, [x27, #0x0]\n" "str d18, [x26, #0x0]\n" "ldr d18, [x9, #0x8]\n" "str d17, [x25, #0x0]\n" "ldr d17, [x27, #0x8]\n" "str d16, [x24, #0x0]\n" "ldr d16, [x26, #0x8]\n" "str d19, [x23, #0x0]\n" "shll v21.4s, v18.4h, #0x10\n" "ldr d18, [x25, #0x8]\n" "shll v20.4s, v17.4h, #0x10\n" "ldr d17, [x24, #0x8]\n" "shll v19.4s, v16.4h, #0x10\n" "ldr d16, [x23, #0x8]\n" "shll v18.4s, v18.4h, #0x10\n" "fadd v1.4s, v1.4s, v21.4s\n" "fadd v0.4s, v0.4s, v20.4s\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v31.4s, v31.4s, v19.4s\n" "fadd v30.4s, v30.4s, v18.4s\n" "fmin v1.4s, v1.4s, v13.4s\n" "fmin v0.4s, v0.4s, v13.4s\n" "fadd v29.4s, v29.4s, v17.4s\n" "fadd v28.4s, v28.4s, v16.4s\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmax v1.4s, v1.4s, v12.4s\n" "fmax v0.4s, v0.4s, v12.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" ".inst 0x0ea16832 // bfcvtn v18.4h, v1.4s\n" ".inst 0x0ea16810 // bfcvtn v16.4h, v0.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n" ".inst 0x0ea16bd1 // bfcvtn v17.4h, v30.4s\n" "str d18, [x9, #0x8]\n" "str d16, [x27, #0x8]\n" ".inst 0x0ea16bb3 // bfcvtn v19.4h, v29.4s\n" ".inst 0x0ea16b92 // bfcvtn v18.4h, v28.4s\n" "ldr d16, [x9, #0x10]\n" "str d20, [x26, #0x8]\n" "str d17, [x25, #0x8]\n" "ldr d17, [x27, #0x10]\n" "str d19, [x24, #0x8]\n" "shll v21.4s, v16.4h, #0x10\n" "ldr d16, [x26, #0x10]\n" "str d18, [x23, #0x8]\n" "ldr d18, [x25, #0x10]\n" "shll v20.4s, v17.4h, #0x10\n" "ldr d17, [x24, #0x10]\n" "shll v19.4s, v16.4h, #0x10\n" "fadd v27.4s, v27.4s, v21.4s\n" "ldr d16, [x23, #0x10]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "fadd v26.4s, v26.4s, v20.4s\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v25.4s, v25.4s, v19.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fadd v24.4s, v24.4s, v18.4s\n" "fadd v23.4s, v23.4s, v17.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" ".inst 0x0ea16b71 // bfcvtn v17.4h, v27.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16b50 // bfcvtn v16.4h, v26.4s\n" "str d17, [x9, #0x10]\n" "add x9, x9, #0x18\n" ".inst 0x0ea16b33 // bfcvtn v19.4h, v25.4s\n" ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n" ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n" "str d16, [x27, #0x10]\n" "add x27, x27, #0x18\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "str d19, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d18, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d17, [x24, #0x10]\n" "add x24, x24, #0x18\n" "str d16, [x23, #0x10]\n" "add x23, x23, #0x18\n" "bge 94b\n" "95:" // Accumulate: Height 6: no full blocks "cbz x10, 97f\n" "mov x20, %x[in_ptr]\n" "96:" // Accumulate: Height 6: Single loop "ldr h21, [x9, #0x0]\n" "ldr h20, [x27, #0x0]\n" "subs x10, x10, #0x1\n" "ldr h19, [x26, #0x0]\n" "ldr h18, [x25, #0x0]\n" "ldr h17, [x24, #0x0]\n" "ldr h16, [x23, #0x0]\n" "ldr s27, [%x[in_ptr], #0x0]\n" "ldr s26, [%x[in_ptr], #0x30]\n" "shll v21.4s, v21.4h, #0x10\n" "shll v20.4s, v20.4h, #0x10\n" "ldr s25, [%x[in_ptr], #0x60]\n" "ldr s24, [%x[in_ptr], #0x90]\n" "shll v19.4s, v19.4h, #0x10\n" "shll v18.4s, v18.4h, #0x10\n" "ldr s23, [%x[in_ptr], #0xc0]\n" "ldr s22, [%x[in_ptr], #0xf0]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v27.4s, v27.4s, v21.4s\n" "fadd v26.4s, v26.4s, v20.4s\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v25.4s, v25.4s, v19.4s\n" "fadd v24.4s, v24.4s, v18.4s\n" "fadd v23.4s, v23.4s, v17.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16b75 // bfcvtn v21.4h, v27.4s\n" ".inst 0x0ea16b54 // bfcvtn v20.4h, v26.4s\n" ".inst 0x0ea16b33 // bfcvtn v19.4h, v25.4s\n" ".inst 0x0ea16b12 // bfcvtn v18.4h, v24.4s\n" ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "str h21, [x9, #0x0]\n" "add x9, x9, #0x2\n" "str h20, [x27, #0x0]\n" "add x27, x27, #0x2\n" "str h19, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h18, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h17, [x24, #0x0]\n" "add x24, x24, #0x2\n" "str h16, [x23, #0x0]\n" "add x23, x23, #0x2\n" "bne 96b\n" "add %x[in_ptr], x20, #0x180\n" "97:" // Accumulate: Height 6: no oddments "b 108f\n" "98:" // Accumulate: Height 7 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "add x23, x24, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x22, x23, %x[ldout], LSL #1\n" "blt 100f\n" "99:" // Accumulate: Height 7: Block loop "ldr d22, [x9, #0x0]\n" "ldr d21, [x27, #0x0]\n" "sub x10, x10, #0xc\n" "ldr d20, [x26, #0x0]\n" "ldr d19, [x25, #0x0]\n" "cmp x10, #0xc\n" "ldr d18, [x24, #0x0]\n" "ldr d17, [x23, #0x0]\n" "ldr d16, [x22, #0x0]\n" "ldr q9, [%x[in_ptr], #0x0]\n" "shll v24.4s, v22.4h, #0x10\n" "shll v23.4s, v21.4h, #0x10\n" "ldr q8, [%x[in_ptr], #0x30]\n" "ldr q7, [%x[in_ptr], #0x60]\n" "shll v21.4s, v20.4h, #0x10\n" "shll v19.4s, v19.4h, #0x10\n" "ldr q6, [%x[in_ptr], #0x90]\n" "ldr q5, [%x[in_ptr], #0xc0]\n" "shll v18.4s, v18.4h, #0x10\n" "shll v17.4s, v17.4h, #0x10\n" "ldr q20, [%x[in_ptr], #0xf0]\n" "ldr q22, [%x[in_ptr], #0x120]\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v9.4s, v9.4s, v24.4s\n" "ldr q4, [%x[in_ptr], #0x10]\n" "ldr q3, [%x[in_ptr], #0x40]\n" "fadd v8.4s, v8.4s, v23.4s\n" "fadd v7.4s, v7.4s, v21.4s\n" "ldr q2, [%x[in_ptr], #0x70]\n" "ldr q1, [%x[in_ptr], #0xa0]\n" "fadd v6.4s, v6.4s, v19.4s\n" "fadd v5.4s, v5.4s, v18.4s\n" "ldr q0, [%x[in_ptr], #0xd0]\n" "ldr q31, [%x[in_ptr], #0x100]\n" "fadd v20.4s, v20.4s, v17.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "ldr q30, [%x[in_ptr], #0x130]\n" "ldr q29, [%x[in_ptr], #0x20]\n" "fmin v9.4s, v9.4s, v13.4s\n" "fmin v8.4s, v8.4s, v13.4s\n" "ldr q28, [%x[in_ptr], #0x50]\n" "ldr q27, [%x[in_ptr], #0x80]\n" "fmin v7.4s, v7.4s, v13.4s\n" "fmin v6.4s, v6.4s, v13.4s\n" "ldr q26, [%x[in_ptr], #0xb0]\n" "ldr q25, [%x[in_ptr], #0xe0]\n" "fmin v5.4s, v5.4s, v13.4s\n" "fmin v20.4s, v20.4s, v13.4s\n" "ldr q24, [%x[in_ptr], #0x110]\n" "ldr q23, [%x[in_ptr], #0x140]\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v9.4s, v9.4s, v12.4s\n" "fmax v8.4s, v8.4s, v12.4s\n" "fmax v7.4s, v7.4s, v12.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fmax v6.4s, v6.4s, v12.4s\n" "fmax v5.4s, v5.4s, v12.4s\n" "fmax v20.4s, v20.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16935 // bfcvtn v21.4h, v9.4s\n" ".inst 0x0ea16913 // bfcvtn v19.4h, v8.4s\n" ".inst 0x0ea168f0 // bfcvtn v16.4h, v7.4s\n" ".inst 0x0ea168d2 // bfcvtn v18.4h, v6.4s\n" ".inst 0x0ea168b1 // bfcvtn v17.4h, v5.4s\n" ".inst 0x0ea16a94 // bfcvtn v20.4h, v20.4s\n" "str d21, [x9, #0x0]\n" "str d19, [x27, #0x0]\n" ".inst 0x0ea16ad3 // bfcvtn v19.4h, v22.4s\n" "str d16, [x26, #0x0]\n" "ldr d16, [x9, #0x8]\n" "str d18, [x25, #0x0]\n" "ldr d18, [x27, #0x8]\n" "str d17, [x24, #0x0]\n" "ldr d17, [x26, #0x8]\n" "str d20, [x23, #0x0]\n" "shll v22.4s, v16.4h, #0x10\n" "ldr d16, [x25, #0x8]\n" "str d19, [x22, #0x0]\n" "shll v21.4s, v18.4h, #0x10\n" "ldr d18, [x24, #0x8]\n" "shll v20.4s, v17.4h, #0x10\n" "ldr d17, [x23, #0x8]\n" "shll v19.4s, v16.4h, #0x10\n" "fadd v4.4s, v4.4s, v22.4s\n" "ldr d16, [x22, #0x8]\n" "shll v18.4s, v18.4h, #0x10\n" "fadd v3.4s, v3.4s, v21.4s\n" "fadd v2.4s, v2.4s, v20.4s\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v1.4s, v1.4s, v19.4s\n" "fadd v0.4s, v0.4s, v18.4s\n" "fmin v4.4s, v4.4s, v13.4s\n" "fadd v31.4s, v31.4s, v17.4s\n" "fmin v3.4s, v3.4s, v13.4s\n" "fadd v30.4s, v30.4s, v16.4s\n" "fmin v2.4s, v2.4s, v13.4s\n" "fmin v1.4s, v1.4s, v13.4s\n" "fmin v0.4s, v0.4s, v13.4s\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmax v4.4s, v4.4s, v12.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmax v3.4s, v3.4s, v12.4s\n" "fmax v2.4s, v2.4s, v12.4s\n" "fmax v1.4s, v1.4s, v12.4s\n" "fmax v0.4s, v0.4s, v12.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" ".inst 0x0ea16893 // bfcvtn v19.4h, v4.4s\n" ".inst 0x0ea16875 // bfcvtn v21.4h, v3.4s\n" ".inst 0x0ea16850 // bfcvtn v16.4h, v2.4s\n" ".inst 0x0ea16832 // bfcvtn v18.4h, v1.4s\n" ".inst 0x0ea16811 // bfcvtn v17.4h, v0.4s\n" "str d19, [x9, #0x8]\n" ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n" ".inst 0x0ea16bd3 // bfcvtn v19.4h, v30.4s\n" "str d21, [x27, #0x8]\n" "str d16, [x26, #0x8]\n" "ldr d16, [x9, #0x10]\n" "str d18, [x25, #0x8]\n" "ldr d18, [x27, #0x10]\n" "str d17, [x24, #0x8]\n" "ldr d17, [x26, #0x10]\n" "str d20, [x23, #0x8]\n" "shll v22.4s, v16.4h, #0x10\n" "ldr d16, [x25, #0x10]\n" "str d19, [x22, #0x8]\n" "shll v21.4s, v18.4h, #0x10\n" "ldr d18, [x24, #0x10]\n" "shll v20.4s, v17.4h, #0x10\n" "ldr d17, [x23, #0x10]\n" "shll v19.4s, v16.4h, #0x10\n" "fadd v29.4s, v29.4s, v22.4s\n" "ldr d16, [x22, #0x10]\n" "shll v18.4s, v18.4h, #0x10\n" "fadd v28.4s, v28.4s, v21.4s\n" "fadd v27.4s, v27.4s, v20.4s\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v26.4s, v26.4s, v19.4s\n" "fadd v25.4s, v25.4s, v18.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fadd v24.4s, v24.4s, v17.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fadd v23.4s, v23.4s, v16.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" ".inst 0x0ea16bb0 // bfcvtn v16.4h, v29.4s\n" ".inst 0x0ea16b95 // bfcvtn v21.4h, v28.4s\n" ".inst 0x0ea16b74 // bfcvtn v20.4h, v27.4s\n" ".inst 0x0ea16b53 // bfcvtn v19.4h, v26.4s\n" ".inst 0x0ea16b32 // bfcvtn v18.4h, v25.4s\n" "str d16, [x9, #0x10]\n" "add x9, x9, #0x18\n" ".inst 0x0ea16b11 // bfcvtn v17.4h, v24.4s\n" ".inst 0x0ea16af0 // bfcvtn v16.4h, v23.4s\n" "str d21, [x27, #0x10]\n" "add x27, x27, #0x18\n" "str d20, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d19, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d18, [x24, #0x10]\n" "add x24, x24, #0x18\n" "str d17, [x23, #0x10]\n" "add x23, x23, #0x18\n" "str d16, [x22, #0x10]\n" "add x22, x22, #0x18\n" "bge 99b\n" "100:" // Accumulate: Height 7: no full blocks "cbz x10, 102f\n" "mov x20, %x[in_ptr]\n" "101:" // Accumulate: Height 7: Single loop "ldr h22, [x9, #0x0]\n" "ldr h21, [x27, #0x0]\n" "subs x10, x10, #0x1\n" "ldr h20, [x26, #0x0]\n" "ldr h19, [x25, #0x0]\n" "ldr h18, [x24, #0x0]\n" "ldr h17, [x23, #0x0]\n" "ldr h16, [x22, #0x0]\n" "ldr s29, [%x[in_ptr], #0x0]\n" "shll v28.4s, v22.4h, #0x10\n" "shll v27.4s, v21.4h, #0x10\n" "ldr s26, [%x[in_ptr], #0x30]\n" "ldr s25, [%x[in_ptr], #0x60]\n" "shll v21.4s, v20.4h, #0x10\n" "shll v20.4s, v19.4h, #0x10\n" "ldr s24, [%x[in_ptr], #0x90]\n" "ldr s23, [%x[in_ptr], #0xc0]\n" "shll v19.4s, v18.4h, #0x10\n" "shll v18.4s, v17.4h, #0x10\n" "ldr s17, [%x[in_ptr], #0xf0]\n" "ldr s22, [%x[in_ptr], #0x120]\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v29.4s, v29.4s, v28.4s\n" "fadd v26.4s, v26.4s, v27.4s\n" "fadd v25.4s, v25.4s, v21.4s\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v24.4s, v24.4s, v20.4s\n" "fadd v23.4s, v23.4s, v19.4s\n" "fadd v17.4s, v17.4s, v18.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v17.4s, v17.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v17.4s, v17.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16bb5 // bfcvtn v21.4h, v29.4s\n" ".inst 0x0ea16b50 // bfcvtn v16.4h, v26.4s\n" ".inst 0x0ea16b34 // bfcvtn v20.4h, v25.4s\n" ".inst 0x0ea16b13 // bfcvtn v19.4h, v24.4s\n" ".inst 0x0ea16af2 // bfcvtn v18.4h, v23.4s\n" ".inst 0x0ea16a31 // bfcvtn v17.4h, v17.4s\n" "str h21, [x9, #0x0]\n" "add x9, x9, #0x2\n" "str h16, [x27, #0x0]\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "add x27, x27, #0x2\n" "str h20, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h19, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h18, [x24, #0x0]\n" "add x24, x24, #0x2\n" "str h17, [x23, #0x0]\n" "add x23, x23, #0x2\n" "str h16, [x22, #0x0]\n" "add x22, x22, #0x2\n" "bne 101b\n" "add %x[in_ptr], x20, #0x180\n" "102:" // Accumulate: Height 7: no oddments "b 108f\n" "103:" // Accumulate: Height 8 "mov x9, %x[out_ptr]\n" "mov x10, %x[cols]\n" "add x27, x9, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "add x23, x24, %x[ldout], LSL #1\n" "cmp x10, #0xc\n" "add x22, x23, %x[ldout], LSL #1\n" "add x21, x22, %x[ldout], LSL #1\n" "blt 105f\n" "104:" // Accumulate: Height 8: Block loop "ldr d23, [x9, #0x0]\n" "ldr d22, [x27, #0x0]\n" "sub x10, x10, #0xc\n" "ldr d21, [x26, #0x0]\n" "ldr d20, [x25, #0x0]\n" "cmp x10, #0xc\n" "ldr d19, [x24, #0x0]\n" "ldr d18, [x23, #0x0]\n" "ldr d17, [x22, #0x0]\n" "ldr d16, [x21, #0x0]\n" "shll v26.4s, v23.4h, #0x10\n" "shll v25.4s, v22.4h, #0x10\n" "ldr q11, [%x[in_ptr], #0x0]\n" "ldr q10, [%x[in_ptr], #0x30]\n" "shll v24.4s, v21.4h, #0x10\n" "shll v23.4s, v20.4h, #0x10\n" "ldr q9, [%x[in_ptr], #0x60]\n" "ldr q8, [%x[in_ptr], #0x90]\n" "shll v21.4s, v19.4h, #0x10\n" "shll v20.4s, v18.4h, #0x10\n" "ldr q18, [%x[in_ptr], #0xc0]\n" "ldr q19, [%x[in_ptr], #0xf0]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "ldr q7, [%x[in_ptr], #0x120]\n" "ldr q22, [%x[in_ptr], #0x150]\n" "fadd v11.4s, v11.4s, v26.4s\n" "fadd v10.4s, v10.4s, v25.4s\n" "ldr q6, [%x[in_ptr], #0x10]\n" "ldr q5, [%x[in_ptr], #0x40]\n" "fadd v9.4s, v9.4s, v24.4s\n" "fadd v8.4s, v8.4s, v23.4s\n" "ldr q4, [%x[in_ptr], #0x70]\n" "ldr q3, [%x[in_ptr], #0xa0]\n" "fadd v18.4s, v18.4s, v21.4s\n" "fadd v19.4s, v19.4s, v20.4s\n" "ldr q2, [%x[in_ptr], #0xd0]\n" "ldr q1, [%x[in_ptr], #0x100]\n" "fadd v7.4s, v7.4s, v17.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "ldr q0, [%x[in_ptr], #0x130]\n" "ldr q31, [%x[in_ptr], #0x160]\n" "fmin v11.4s, v11.4s, v13.4s\n" "fmin v10.4s, v10.4s, v13.4s\n" "ldr q30, [%x[in_ptr], #0x20]\n" "ldr q29, [%x[in_ptr], #0x50]\n" "fmin v9.4s, v9.4s, v13.4s\n" "fmin v8.4s, v8.4s, v13.4s\n" "ldr q28, [%x[in_ptr], #0x80]\n" "ldr q27, [%x[in_ptr], #0xb0]\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "ldr q26, [%x[in_ptr], #0xe0]\n" "ldr q25, [%x[in_ptr], #0x110]\n" "fmin v7.4s, v7.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "ldr q24, [%x[in_ptr], #0x140]\n" "ldr q23, [%x[in_ptr], #0x170]\n" "fmax v11.4s, v11.4s, v12.4s\n" "fmax v10.4s, v10.4s, v12.4s\n" "fmax v9.4s, v9.4s, v12.4s\n" "fmax v8.4s, v8.4s, v12.4s\n" "add %x[in_ptr], %x[in_ptr], #0x180\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v7.4s, v7.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16975 // bfcvtn v21.4h, v11.4s\n" ".inst 0x0ea16954 // bfcvtn v20.4h, v10.4s\n" ".inst 0x0ea16931 // bfcvtn v17.4h, v9.4s\n" ".inst 0x0ea16910 // bfcvtn v16.4h, v8.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" "str d21, [x9, #0x0]\n" "str d20, [x27, #0x0]\n" ".inst 0x0ea168f5 // bfcvtn v21.4h, v7.4s\n" ".inst 0x0ea16ad4 // bfcvtn v20.4h, v22.4s\n" "str d17, [x26, #0x0]\n" "ldr d17, [x9, #0x8]\n" "str d16, [x25, #0x0]\n" "ldr d16, [x27, #0x8]\n" "str d18, [x24, #0x0]\n" "ldr d18, [x26, #0x8]\n" "str d19, [x23, #0x0]\n" "shll v19.4s, v17.4h, #0x10\n" "ldr d17, [x25, #0x8]\n" "str d21, [x22, #0x0]\n" "shll v22.4s, v16.4h, #0x10\n" "ldr d16, [x24, #0x8]\n" "str d20, [x21, #0x0]\n" "shll v21.4s, v18.4h, #0x10\n" "ldr d18, [x23, #0x8]\n" "shll v20.4s, v17.4h, #0x10\n" "fadd v6.4s, v6.4s, v19.4s\n" "ldr d17, [x22, #0x8]\n" "shll v19.4s, v16.4h, #0x10\n" "fadd v5.4s, v5.4s, v22.4s\n" "ldr d16, [x21, #0x8]\n" "shll v18.4s, v18.4h, #0x10\n" "fadd v4.4s, v4.4s, v21.4s\n" "fadd v3.4s, v3.4s, v20.4s\n" "shll v17.4s, v17.4h, #0x10\n" "fadd v2.4s, v2.4s, v19.4s\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v1.4s, v1.4s, v18.4s\n" "fmin v6.4s, v6.4s, v13.4s\n" "fmin v5.4s, v5.4s, v13.4s\n" "fadd v0.4s, v0.4s, v17.4s\n" "fmin v4.4s, v4.4s, v13.4s\n" "fadd v31.4s, v31.4s, v16.4s\n" "fmin v3.4s, v3.4s, v13.4s\n" "fmin v2.4s, v2.4s, v13.4s\n" "fmin v1.4s, v1.4s, v13.4s\n" "fmin v0.4s, v0.4s, v13.4s\n" "fmax v6.4s, v6.4s, v12.4s\n" "fmin v31.4s, v31.4s, v13.4s\n" "fmax v5.4s, v5.4s, v12.4s\n" "fmax v4.4s, v4.4s, v12.4s\n" "fmax v3.4s, v3.4s, v12.4s\n" "fmax v2.4s, v2.4s, v12.4s\n" "fmax v1.4s, v1.4s, v12.4s\n" "fmax v0.4s, v0.4s, v12.4s\n" "fmax v31.4s, v31.4s, v12.4s\n" ".inst 0x0ea168d5 // bfcvtn v21.4h, v6.4s\n" ".inst 0x0ea168b4 // bfcvtn v20.4h, v5.4s\n" ".inst 0x0ea16891 // bfcvtn v17.4h, v4.4s\n" ".inst 0x0ea16870 // bfcvtn v16.4h, v3.4s\n" ".inst 0x0ea16852 // bfcvtn v18.4h, v2.4s\n" ".inst 0x0ea16833 // bfcvtn v19.4h, v1.4s\n" "str d21, [x9, #0x8]\n" "str d20, [x27, #0x8]\n" ".inst 0x0ea16815 // bfcvtn v21.4h, v0.4s\n" ".inst 0x0ea16bf4 // bfcvtn v20.4h, v31.4s\n" "str d17, [x26, #0x8]\n" "ldr d17, [x9, #0x10]\n" "str d16, [x25, #0x8]\n" "ldr d16, [x27, #0x10]\n" "str d18, [x24, #0x8]\n" "ldr d18, [x26, #0x10]\n" "str d19, [x23, #0x8]\n" "shll v19.4s, v17.4h, #0x10\n" "ldr d17, [x25, #0x10]\n" "str d21, [x22, #0x8]\n" "shll v22.4s, v16.4h, #0x10\n" "ldr d16, [x24, #0x10]\n" "str d20, [x21, #0x8]\n" "shll v21.4s, v18.4h, #0x10\n" "ldr d18, [x23, #0x10]\n" "shll v20.4s, v17.4h, #0x10\n" "fadd v30.4s, v30.4s, v19.4s\n" "ldr d17, [x22, #0x10]\n" "shll v19.4s, v16.4h, #0x10\n" "fadd v29.4s, v29.4s, v22.4s\n" "ldr d16, [x21, #0x10]\n" "shll v18.4s, v18.4h, #0x10\n" "fadd v28.4s, v28.4s, v21.4s\n" "fadd v27.4s, v27.4s, v20.4s\n" "shll v17.4s, v17.4h, #0x10\n" "fadd v26.4s, v26.4s, v19.4s\n" "shll v16.4s, v16.4h, #0x10\n" "fadd v25.4s, v25.4s, v18.4s\n" "fmin v30.4s, v30.4s, v13.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fadd v24.4s, v24.4s, v17.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fadd v23.4s, v23.4s, v16.4s\n" "fmin v27.4s, v27.4s, v13.4s\n" "fmin v26.4s, v26.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmax v30.4s, v30.4s, v12.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" "fmax v27.4s, v27.4s, v12.4s\n" "fmax v26.4s, v26.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" ".inst 0x0ea16bd1 // bfcvtn v17.4h, v30.4s\n" ".inst 0x0ea16bb0 // bfcvtn v16.4h, v29.4s\n" ".inst 0x0ea16b95 // bfcvtn v21.4h, v28.4s\n" ".inst 0x0ea16b74 // bfcvtn v20.4h, v27.4s\n" ".inst 0x0ea16b53 // bfcvtn v19.4h, v26.4s\n" ".inst 0x0ea16b32 // bfcvtn v18.4h, v25.4s\n" "str d17, [x9, #0x10]\n" "add x9, x9, #0x18\n" "str d16, [x27, #0x10]\n" ".inst 0x0ea16b11 // bfcvtn v17.4h, v24.4s\n" ".inst 0x0ea16af0 // bfcvtn v16.4h, v23.4s\n" "add x27, x27, #0x18\n" "str d21, [x26, #0x10]\n" "add x26, x26, #0x18\n" "str d20, [x25, #0x10]\n" "add x25, x25, #0x18\n" "str d19, [x24, #0x10]\n" "add x24, x24, #0x18\n" "str d18, [x23, #0x10]\n" "add x23, x23, #0x18\n" "str d17, [x22, #0x10]\n" "add x22, x22, #0x18\n" "str d16, [x21, #0x10]\n" "add x21, x21, #0x18\n" "bge 104b\n" "105:" // Accumulate: Height 8: no full blocks "cbz x10, 107f\n" "mov x20, %x[in_ptr]\n" "106:" // Accumulate: Height 8: Single loop "ldr h23, [x9, #0x0]\n" "ldr h22, [x27, #0x0]\n" "subs x10, x10, #0x1\n" "ldr h21, [x26, #0x0]\n" "ldr h20, [x25, #0x0]\n" "ldr h19, [x24, #0x0]\n" "ldr h18, [x23, #0x0]\n" "ldr h17, [x22, #0x0]\n" "ldr h16, [x21, #0x0]\n" "shll v31.4s, v23.4h, #0x10\n" "shll v30.4s, v22.4h, #0x10\n" "ldr s29, [%x[in_ptr], #0x0]\n" "ldr s28, [%x[in_ptr], #0x30]\n" "shll v27.4s, v21.4h, #0x10\n" "shll v26.4s, v20.4h, #0x10\n" "ldr s25, [%x[in_ptr], #0x60]\n" "ldr s24, [%x[in_ptr], #0x90]\n" "shll v21.4s, v19.4h, #0x10\n" "shll v20.4s, v18.4h, #0x10\n" "ldr s19, [%x[in_ptr], #0xc0]\n" "ldr s18, [%x[in_ptr], #0xf0]\n" "shll v17.4s, v17.4h, #0x10\n" "shll v16.4s, v16.4h, #0x10\n" "ldr s23, [%x[in_ptr], #0x120]\n" "ldr s22, [%x[in_ptr], #0x150]\n" "fadd v29.4s, v29.4s, v31.4s\n" "fadd v28.4s, v28.4s, v30.4s\n" "fadd v25.4s, v25.4s, v27.4s\n" "fadd v24.4s, v24.4s, v26.4s\n" "add %x[in_ptr], %x[in_ptr], #0x4\n" "fadd v19.4s, v19.4s, v21.4s\n" "fadd v18.4s, v18.4s, v20.4s\n" "fadd v23.4s, v23.4s, v17.4s\n" "fadd v22.4s, v22.4s, v16.4s\n" "fmin v29.4s, v29.4s, v13.4s\n" "fmin v28.4s, v28.4s, v13.4s\n" "fmin v25.4s, v25.4s, v13.4s\n" "fmin v24.4s, v24.4s, v13.4s\n" "fmin v19.4s, v19.4s, v13.4s\n" "fmin v18.4s, v18.4s, v13.4s\n" "fmin v23.4s, v23.4s, v13.4s\n" "fmin v22.4s, v22.4s, v13.4s\n" "fmax v29.4s, v29.4s, v12.4s\n" "fmax v28.4s, v28.4s, v12.4s\n" "fmax v25.4s, v25.4s, v12.4s\n" "fmax v24.4s, v24.4s, v12.4s\n" "fmax v19.4s, v19.4s, v12.4s\n" "fmax v18.4s, v18.4s, v12.4s\n" "fmax v23.4s, v23.4s, v12.4s\n" "fmax v22.4s, v22.4s, v12.4s\n" ".inst 0x0ea16bb1 // bfcvtn v17.4h, v29.4s\n" ".inst 0x0ea16b90 // bfcvtn v16.4h, v28.4s\n" ".inst 0x0ea16b35 // bfcvtn v21.4h, v25.4s\n" ".inst 0x0ea16b14 // bfcvtn v20.4h, v24.4s\n" ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n" ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n" "str h17, [x9, #0x0]\n" "add x9, x9, #0x2\n" "str h16, [x27, #0x0]\n" ".inst 0x0ea16af1 // bfcvtn v17.4h, v23.4s\n" ".inst 0x0ea16ad0 // bfcvtn v16.4h, v22.4s\n" "add x27, x27, #0x2\n" "str h21, [x26, #0x0]\n" "add x26, x26, #0x2\n" "str h20, [x25, #0x0]\n" "add x25, x25, #0x2\n" "str h19, [x24, #0x0]\n" "add x24, x24, #0x2\n" "str h18, [x23, #0x0]\n" "add x23, x23, #0x2\n" "str h17, [x22, #0x0]\n" "add x22, x22, #0x2\n" "str h16, [x21, #0x0]\n" "add x21, x21, #0x2\n" "bne 106b\n" "add %x[in_ptr], x20, #0x180\n" "107:" // Accumulate: Height 8: no oddments "subs %x[rows], %x[rows], #0x8\n" "add %x[out_ptr], %x[out_ptr], x11\n" "bgt 67b\n" "108:" // Exit : [in_ptr] "+&r" (in_ptr), [out_ptr] "+&r" (out_ptr), [rows] "+&r" (rows) : [accumulate] "r" (accumulate), [bias] "r" (bias), [cols] "r" (cols), [ldout] "r" (ldout), [maxval] "r" (maxval), [minval] "r" (minval) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } #endif // __aarch64__