/* * Copyright (c) 2024 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #ifdef ARM_COMPUTE_ENABLE_SVE template<> void MergeResults<3, 8, true>( bfloat16 *out_ptr, const float * in_ptr, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const bfloat16 *bias, Activation act, bool accumulate) { float maxval = static_cast(std::numeric_limits::infinity()); float minval = - static_cast(std::numeric_limits::infinity()); switch(act.type) { default: case Activation::Type::None: break; case Activation::Type::BoundedReLU: maxval = static_cast(act.param1); /* fall through */ case Activation::Type::ReLU: minval = 0; break; } size_t rows = ymax-y0; size_t cols = xmax-x0; out_ptr += (y0 * ldout) + x0; bias = (bias == nullptr) ? nullptr : bias + x0; __asm__ __volatile__( "ptrue p3.b\n" "cbz %x[cols], 52f\n" "cbz %x[rows], 52f\n" "mov x12, #0x20\n" "dup z12.s, %w[maxval]\n" "dup z11.s, %w[minval]\n" "mul x12, %x[ldout], x12\n" "cbnz %x[accumulate], 34f\n" "1:" // Initial: Row loop "cmp %x[rows], #0x7\n" "bgt 30f\n" "beq 26f\n" "cmp %x[rows], #0x5\n" "bgt 22f\n" "beq 18f\n" "cmp %x[rows], #0x3\n" "bgt 14f\n" "beq 10f\n" "cmp %x[rows], #0x1\n" "bgt 6f\n" "2:" // Initial: Height 1 "mov x11, %x[cols]\n" "mov x10, %x[out_ptr]\n" "mov x9, %x[bias]\n" "3:" // Initial: Height 1: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 4f\n" "mov z21.b, #0x0\n" "mov z20.b, #0x0\n" "mov z19.b, #0x0\n" "b 5f\n" "4:" // Initial: Height 1: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z21.s, z18.s, #0x10\n" "lsl z20.s, z17.s, #0x10\n" "lsl z19.s, z16.s, #0x10\n" "5:" // Initial: Height 1: Width 3: init done "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z18.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "fadd z17.s, z17.s, z21.s\n" "fadd z16.s, z16.s, z20.s\n" "cmp x11, XZR\n" "fadd z18.s, z18.s, z19.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" "st1h { z17.s }, p2, [x10]\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n" "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" "bgt 3b\n" "b 52f\n" "6:" // Initial: Height 2 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "mov x9, %x[bias]\n" "add x28, x10, %x[ldout], LSL #1\n" "7:" // Initial: Height 2: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 8f\n" "mov z24.b, #0x0\n" "mov z23.b, #0x0\n" "mov z22.b, #0x0\n" "b 9f\n" "8:" // Initial: Height 2: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z24.s, z18.s, #0x10\n" "lsl z23.s, z17.s, #0x10\n" "lsl z22.s, z16.s, #0x10\n" "9:" // Initial: Height 2: Width 3: init done "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "fadd z17.s, z17.s, z24.s\n" "fadd z16.s, z16.s, z23.s\n" "cmp x11, XZR\n" "fadd z19.s, z19.s, z22.s\n" "fadd z18.s, z18.s, z24.s\n" "fadd z21.s, z21.s, z23.s\n" "fadd z20.s, z20.s, z22.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" "st1h { z17.s }, p2, [x10]\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n" ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" "st1h { z19.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" "st1h { z18.s }, p2, [x28]\n" "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" "bgt 7b\n" "b 52f\n" "10:" // Initial: Height 3 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "mov x9, %x[bias]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "11:" // Initial: Height 3: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 12f\n" "mov z27.b, #0x0\n" "mov z26.b, #0x0\n" "mov z25.b, #0x0\n" "b 13f\n" "12:" // Initial: Height 3: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z27.s, z18.s, #0x10\n" "lsl z26.s, z17.s, #0x10\n" "lsl z25.s, z16.s, #0x10\n" "13:" // Initial: Height 3: Width 3: init done "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z19.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "fadd z18.s, z18.s, z27.s\n" "fadd z17.s, z17.s, z26.s\n" "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n" "fadd z16.s, z16.s, z25.s\n" "fadd z21.s, z21.s, z27.s\n" "cmp x11, XZR\n" "fadd z20.s, z20.s, z26.s\n" "fadd z19.s, z19.s, z25.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "fadd z24.s, z24.s, z27.s\n" "fadd z23.s, z23.s, z26.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fadd z22.s, z22.s, z25.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" "st1h { z18.s }, p2, [x10]\n" ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n" "st1h { z17.s }, p1, [x10, #1, MUL VL]\n" "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" "inch x10, ALL, MUL #3\n" "st1h { z21.s }, p2, [x28]\n" "st1h { z20.s }, p1, [x28, #1, MUL VL]\n" "st1h { z19.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" "st1h { z18.s }, p2, [x27]\n" "st1h { z17.s }, p1, [x27, #1, MUL VL]\n" "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "bgt 11b\n" "b 52f\n" "14:" // Initial: Height 4 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "mov x9, %x[bias]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "15:" // Initial: Height 4: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 16f\n" "mov z30.b, #0x0\n" "mov z29.b, #0x0\n" "mov z28.b, #0x0\n" "b 17f\n" "16:" // Initial: Height 4: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z30.s, z18.s, #0x10\n" "lsl z29.s, z17.s, #0x10\n" "lsl z28.s, z16.s, #0x10\n" "17:" // Initial: Height 4: Width 3: init done "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z24.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z20.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "fadd z18.s, z18.s, z30.s\n" "fadd z17.s, z17.s, z29.s\n" "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z27.s }, p2/Z, [x20, #-7, MUL VL]\n" "fadd z16.s, z16.s, z28.s\n" "fadd z24.s, z24.s, z30.s\n" "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n" "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n" "fadd z23.s, z23.s, z29.s\n" "fadd z22.s, z22.s, z28.s\n" "fadd z21.s, z21.s, z30.s\n" "fadd z20.s, z20.s, z29.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fadd z19.s, z19.s, z28.s\n" "fadd z27.s, z27.s, z30.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fadd z26.s, z26.s, z29.s\n" "fadd z25.s, z25.s, z28.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" "cmp x11, XZR\n" "st1h { z18.s }, p2, [x10]\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" "st1h { z17.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" "inch x10, ALL, MUL #3\n" "st1h { z24.s }, p2, [x28]\n" "st1h { z23.s }, p1, [x28, #1, MUL VL]\n" "st1h { z22.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" "st1h { z21.s }, p2, [x27]\n" "st1h { z20.s }, p1, [x27, #1, MUL VL]\n" "st1h { z19.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "st1h { z18.s }, p2, [x26]\n" "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" "st1h { z16.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "bgt 15b\n" "b 52f\n" "18:" // Initial: Height 5 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "mov x9, %x[bias]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "19:" // Initial: Height 5: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 20f\n" "mov z1.b, #0x0\n" "mov z0.b, #0x0\n" "mov z31.b, #0x0\n" "b 21f\n" "20:" // Initial: Height 5: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z1.s, z18.s, #0x10\n" "lsl z0.s, z17.s, #0x10\n" "lsl z31.s, z16.s, #0x10\n" "21:" // Initial: Height 5: Width 3: init done "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "fadd z21.s, z21.s, z1.s\n" "fadd z20.s, z20.s, z0.s\n" "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z30.s }, p2/Z, [x20, #-7, MUL VL]\n" "fadd z19.s, z19.s, z31.s\n" "fadd z18.s, z18.s, z1.s\n" "ld1w { z29.s }, p1/Z, [x20, #-6, MUL VL]\n" "ld1w { z28.s }, p0/Z, [x20, #-5, MUL VL]\n" "fadd z17.s, z17.s, z0.s\n" "fadd z16.s, z16.s, z31.s\n" "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n" "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n" "fadd z24.s, z24.s, z1.s\n" "fadd z23.s, z23.s, z0.s\n" "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n" "fadd z22.s, z22.s, z31.s\n" "fadd z30.s, z30.s, z1.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fadd z29.s, z29.s, z0.s\n" "fadd z28.s, z28.s, z31.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fadd z27.s, z27.s, z1.s\n" "fadd z26.s, z26.s, z0.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fadd z25.s, z25.s, z31.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "st1h { z21.s }, p2, [x10]\n" ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "cmp x11, XZR\n" "st1h { z20.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" "st1h { z19.s }, p0, [x10, #2, MUL VL]\n" ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" ".inst 0x658aafd5 // bfcvt z21.h, p3/M, z30.s\n" "inch x10, ALL, MUL #3\n" "st1h { z18.s }, p2, [x28]\n" ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n" ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n" ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" "inch x28, ALL, MUL #3\n" "st1h { z24.s }, p2, [x27]\n" "st1h { z23.s }, p1, [x27, #1, MUL VL]\n" "st1h { z22.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "st1h { z21.s }, p2, [x26]\n" "st1h { z20.s }, p1, [x26, #1, MUL VL]\n" "st1h { z19.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "st1h { z18.s }, p2, [x25]\n" "st1h { z17.s }, p1, [x25, #1, MUL VL]\n" "st1h { z16.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "bgt 19b\n" "b 52f\n" "22:" // Initial: Height 6 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "mov x9, %x[bias]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "23:" // Initial: Height 6: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 24f\n" "mov z4.b, #0x0\n" "mov z3.b, #0x0\n" "mov z2.b, #0x0\n" "b 25f\n" "24:" // Initial: Height 6: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z4.s, z18.s, #0x10\n" "lsl z3.s, z17.s, #0x10\n" "lsl z2.s, z16.s, #0x10\n" "25:" // Initial: Height 6: Width 3: init done "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z20.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z19.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "ld1w { z1.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z0.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "fadd z17.s, z17.s, z4.s\n" "fadd z16.s, z16.s, z3.s\n" "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z24.s }, p2/Z, [x20, #-7, MUL VL]\n" "fadd z21.s, z21.s, z2.s\n" "fadd z20.s, z20.s, z4.s\n" "ld1w { z23.s }, p1/Z, [x20, #-6, MUL VL]\n" "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n" "fadd z19.s, z19.s, z3.s\n" "fadd z18.s, z18.s, z2.s\n" "ld1w { z31.s }, p2/Z, [x20, #-4, MUL VL]\n" "ld1w { z30.s }, p1/Z, [x20, #-3, MUL VL]\n" "fadd z1.s, z1.s, z4.s\n" "fadd z0.s, z0.s, z3.s\n" "ld1w { z29.s }, p0/Z, [x20, #-2, MUL VL]\n" "ld1w { z28.s }, p2/Z, [x20, #-1, MUL VL]\n" "fadd z25.s, z25.s, z2.s\n" "fadd z24.s, z24.s, z4.s\n" "ld1w { z27.s }, p1/Z, [x20]\n" "ld1w { z26.s }, p0/Z, [x20, #1, MUL VL]\n" "fadd z23.s, z23.s, z3.s\n" "fadd z22.s, z22.s, z2.s\n" "fadd z31.s, z31.s, z4.s\n" "fadd z30.s, z30.s, z3.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fadd z29.s, z29.s, z2.s\n" "fadd z28.s, z28.s, z4.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fadd z27.s, z27.s, z3.s\n" "fadd z26.s, z26.s, z2.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmin z1.s, p3/M, z1.s, z12.s\n" "fmin z0.s, p3/M, z0.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmin z31.s, p3/M, z31.s, z12.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z1.s, p3/M, z1.s, z11.s\n" "fmax z0.s, p3/M, z0.s, z11.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" "fmax z31.s, p3/M, z31.s, z11.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" "st1h { z17.s }, p2, [x10]\n" ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aac10 // bfcvt z16.h, p3/M, z0.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "st1h { z21.s }, p0, [x10, #2, MUL VL]\n" ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n" "cmp x11, XZR\n" "st1h { z20.s }, p2, [x28]\n" ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" "st1h { z19.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" ".inst 0x658aaff5 // bfcvt z21.h, p3/M, z31.s\n" "inch x10, ALL, MUL #3\n" "st1h { z18.s }, p0, [x28, #2, MUL VL]\n" ".inst 0x658aafd4 // bfcvt z20.h, p3/M, z30.s\n" ".inst 0x658aafb3 // bfcvt z19.h, p3/M, z29.s\n" "inch x28, ALL, MUL #3\n" "st1h { z17.s }, p2, [x27]\n" ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n" ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z16.s }, p1, [x27, #1, MUL VL]\n" ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n" "st1h { z25.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "st1h { z24.s }, p2, [x26]\n" "st1h { z23.s }, p1, [x26, #1, MUL VL]\n" "st1h { z22.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "st1h { z21.s }, p2, [x25]\n" "st1h { z20.s }, p1, [x25, #1, MUL VL]\n" "st1h { z19.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "st1h { z18.s }, p2, [x24]\n" "st1h { z17.s }, p1, [x24, #1, MUL VL]\n" "st1h { z16.s }, p0, [x24, #2, MUL VL]\n" "inch x24, ALL, MUL #3\n" "bgt 23b\n" "b 52f\n" "26:" // Initial: Height 7 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "mov x9, %x[bias]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "add x23, x24, %x[ldout], LSL #1\n" "27:" // Initial: Height 7: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 28f\n" "mov z7.b, #0x0\n" "mov z6.b, #0x0\n" "mov z5.b, #0x0\n" "b 29f\n" "28:" // Initial: Height 7: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z7.s, z18.s, #0x10\n" "lsl z6.s, z17.s, #0x10\n" "lsl z5.s, z16.s, #0x10\n" "29:" // Initial: Height 7: Width 3: init done "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z16.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z3.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "fadd z19.s, z19.s, z7.s\n" "fadd z18.s, z18.s, z6.s\n" "ld1w { z2.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z1.s }, p2/Z, [x20, #-7, MUL VL]\n" "fadd z17.s, z17.s, z5.s\n" "fadd z16.s, z16.s, z7.s\n" "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n" "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n" "fadd z21.s, z21.s, z6.s\n" "fadd z20.s, z20.s, z5.s\n" "ld1w { z24.s }, p2/Z, [x20, #-4, MUL VL]\n" "ld1w { z23.s }, p1/Z, [x20, #-3, MUL VL]\n" "fadd z4.s, z4.s, z7.s\n" "fadd z3.s, z3.s, z6.s\n" "ld1w { z22.s }, p0/Z, [x20, #-2, MUL VL]\n" "ld1w { z0.s }, p2/Z, [x20, #-1, MUL VL]\n" "fadd z2.s, z2.s, z5.s\n" "fadd z1.s, z1.s, z7.s\n" "ld1w { z31.s }, p1/Z, [x20]\n" "ld1w { z30.s }, p0/Z, [x20, #1, MUL VL]\n" "fadd z26.s, z26.s, z6.s\n" "fadd z25.s, z25.s, z5.s\n" "ld1w { z29.s }, p2/Z, [x20, #2, MUL VL]\n" "ld1w { z28.s }, p1/Z, [x20, #3, MUL VL]\n" "fadd z24.s, z24.s, z7.s\n" "fadd z23.s, z23.s, z6.s\n" "ld1w { z27.s }, p0/Z, [x20, #4, MUL VL]\n" "fadd z22.s, z22.s, z5.s\n" "fadd z0.s, z0.s, z7.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fadd z31.s, z31.s, z6.s\n" "fadd z30.s, z30.s, z5.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fadd z29.s, z29.s, z7.s\n" "fadd z28.s, z28.s, z6.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fadd z27.s, z27.s, z5.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmin z4.s, p3/M, z4.s, z12.s\n" "fmin z3.s, p3/M, z3.s, z12.s\n" "fmin z2.s, p3/M, z2.s, z12.s\n" "fmin z1.s, p3/M, z1.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmin z0.s, p3/M, z0.s, z12.s\n" "fmin z31.s, p3/M, z31.s, z12.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" "fmax z4.s, p3/M, z4.s, z11.s\n" "fmax z3.s, p3/M, z3.s, z11.s\n" ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" "fmax z2.s, p3/M, z2.s, z11.s\n" "fmax z1.s, p3/M, z1.s, z11.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "st1h { z19.s }, p2, [x10]\n" ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" "fmax z0.s, p3/M, z0.s, z11.s\n" "st1h { z18.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aac72 // bfcvt z18.h, p3/M, z3.s\n" "fmax z31.s, p3/M, z31.s, z11.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" ".inst 0x658aac51 // bfcvt z17.h, p3/M, z2.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "st1h { z16.s }, p2, [x28]\n" ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "cmp x11, XZR\n" "st1h { z21.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n" "st1h { z20.s }, p0, [x28, #2, MUL VL]\n" ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n" ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" "inch x10, ALL, MUL #3\n" "st1h { z19.s }, p2, [x27]\n" ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" "inch x28, ALL, MUL #3\n" "st1h { z18.s }, p1, [x27, #1, MUL VL]\n" ".inst 0x658aac15 // bfcvt z21.h, p3/M, z0.s\n" ".inst 0x658aaff4 // bfcvt z20.h, p3/M, z31.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z17.s }, p0, [x27, #2, MUL VL]\n" ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n" ".inst 0x658aafb2 // bfcvt z18.h, p3/M, z29.s\n" "inch x27, ALL, MUL #3\n" "st1h { z16.s }, p2, [x26]\n" ".inst 0x658aaf91 // bfcvt z17.h, p3/M, z28.s\n" ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n" "st1h { z26.s }, p1, [x26, #1, MUL VL]\n" "st1h { z25.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "st1h { z24.s }, p2, [x25]\n" "st1h { z23.s }, p1, [x25, #1, MUL VL]\n" "st1h { z22.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "st1h { z21.s }, p2, [x24]\n" "st1h { z20.s }, p1, [x24, #1, MUL VL]\n" "st1h { z19.s }, p0, [x24, #2, MUL VL]\n" "inch x24, ALL, MUL #3\n" "st1h { z18.s }, p2, [x23]\n" "st1h { z17.s }, p1, [x23, #1, MUL VL]\n" "st1h { z16.s }, p0, [x23, #2, MUL VL]\n" "inch x23, ALL, MUL #3\n" "bgt 27b\n" "b 52f\n" "30:" // Initial: Height 8 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "mov x9, %x[bias]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "add x23, x24, %x[ldout], LSL #1\n" "add x22, x23, %x[ldout], LSL #1\n" "31:" // Initial: Height 8: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "whilelt p0.s, x21, x11\n" "incw x21\n" "cbnz %x[bias], 32f\n" "mov z10.b, #0x0\n" "mov z9.b, #0x0\n" "mov z8.b, #0x0\n" "b 33f\n" "32:" // Initial: Height 8: Width 3: bias "ld1h { z18.s }, p2/Z, [x9]\n" "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" "lsl z10.s, z18.s, #0x10\n" "lsl z9.s, z17.s, #0x10\n" "lsl z8.s, z16.s, #0x10\n" "33:" // Initial: Height 8: Width 3: init done "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "decw x11, ALL, MUL #3\n" "inch x9, ALL, MUL #3\n" "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z6.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "fadd z21.s, z21.s, z10.s\n" "fadd z20.s, z20.s, z9.s\n" "ld1w { z5.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z4.s }, p2/Z, [x20, #-7, MUL VL]\n" "fadd z19.s, z19.s, z8.s\n" "fadd z18.s, z18.s, z10.s\n" "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n" "ld1w { z2.s }, p0/Z, [x20, #-5, MUL VL]\n" "fadd z17.s, z17.s, z9.s\n" "fadd z16.s, z16.s, z8.s\n" "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n" "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n" "fadd z7.s, z7.s, z10.s\n" "fadd z6.s, z6.s, z9.s\n" "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n" "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n" "fadd z5.s, z5.s, z8.s\n" "fadd z4.s, z4.s, z10.s\n" "ld1w { z23.s }, p1/Z, [x20]\n" "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n" "fadd z3.s, z3.s, z9.s\n" "fadd z2.s, z2.s, z8.s\n" "ld1w { z1.s }, p2/Z, [x20, #2, MUL VL]\n" "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" "fadd z27.s, z27.s, z10.s\n" "fadd z26.s, z26.s, z9.s\n" "ld1w { z31.s }, p0/Z, [x20, #4, MUL VL]\n" "ld1w { z30.s }, p2/Z, [x20, #5, MUL VL]\n" "fadd z25.s, z25.s, z8.s\n" "fadd z24.s, z24.s, z10.s\n" "ld1w { z29.s }, p1/Z, [x20, #6, MUL VL]\n" "ld1w { z28.s }, p0/Z, [x20, #7, MUL VL]\n" "fadd z23.s, z23.s, z9.s\n" "fadd z22.s, z22.s, z8.s\n" "fadd z1.s, z1.s, z10.s\n" "fadd z0.s, z0.s, z9.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fadd z31.s, z31.s, z8.s\n" "fadd z30.s, z30.s, z10.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fadd z29.s, z29.s, z9.s\n" "fadd z28.s, z28.s, z8.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fmin z16.s, p3/M, z16.s, z12.s\n" "fmin z7.s, p3/M, z7.s, z12.s\n" "fmin z6.s, p3/M, z6.s, z12.s\n" "fmin z5.s, p3/M, z5.s, z12.s\n" "fmin z4.s, p3/M, z4.s, z12.s\n" "fmin z3.s, p3/M, z3.s, z12.s\n" "fmin z2.s, p3/M, z2.s, z12.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmin z1.s, p3/M, z1.s, z12.s\n" "fmin z0.s, p3/M, z0.s, z12.s\n" "fmin z31.s, p3/M, z31.s, z12.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "fmax z16.s, p3/M, z16.s, z11.s\n" "fmax z7.s, p3/M, z7.s, z11.s\n" "fmax z6.s, p3/M, z6.s, z11.s\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" "fmax z5.s, p3/M, z5.s, z11.s\n" "fmax z4.s, p3/M, z4.s, z11.s\n" ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" "fmax z3.s, p3/M, z3.s, z11.s\n" "fmax z2.s, p3/M, z2.s, z11.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "st1h { z21.s }, p2, [x10]\n" ".inst 0x658aacf5 // bfcvt z21.h, p3/M, z7.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "st1h { z20.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aacd4 // bfcvt z20.h, p3/M, z6.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" "st1h { z19.s }, p0, [x10, #2, MUL VL]\n" ".inst 0x658aacb3 // bfcvt z19.h, p3/M, z5.s\n" "fmax z1.s, p3/M, z1.s, z11.s\n" "fmax z0.s, p3/M, z0.s, z11.s\n" "st1h { z18.s }, p2, [x28]\n" ".inst 0x658aac92 // bfcvt z18.h, p3/M, z4.s\n" "fmax z31.s, p3/M, z31.s, z11.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n" "cmp x11, XZR\n" "st1h { z21.s }, p2, [x27]\n" ".inst 0x658aaf7b // bfcvt z27.h, p3/M, z27.s\n" ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n" "st1h { z20.s }, p1, [x27, #1, MUL VL]\n" ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n" ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" "inch x10, ALL, MUL #3\n" "st1h { z19.s }, p0, [x27, #2, MUL VL]\n" ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" "inch x28, ALL, MUL #3\n" "st1h { z18.s }, p2, [x26]\n" ".inst 0x658aac35 // bfcvt z21.h, p3/M, z1.s\n" ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n" "inch x27, ALL, MUL #3\n" "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n" ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z16.s }, p0, [x26, #2, MUL VL]\n" ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" "inch x26, ALL, MUL #3\n" "st1h { z27.s }, p2, [x25]\n" "st1h { z26.s }, p1, [x25, #1, MUL VL]\n" "st1h { z25.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "st1h { z24.s }, p2, [x24]\n" "st1h { z23.s }, p1, [x24, #1, MUL VL]\n" "st1h { z22.s }, p0, [x24, #2, MUL VL]\n" "inch x24, ALL, MUL #3\n" "st1h { z21.s }, p2, [x23]\n" "st1h { z20.s }, p1, [x23, #1, MUL VL]\n" "st1h { z19.s }, p0, [x23, #2, MUL VL]\n" "inch x23, ALL, MUL #3\n" "st1h { z18.s }, p2, [x22]\n" "st1h { z17.s }, p1, [x22, #1, MUL VL]\n" "st1h { z16.s }, p0, [x22, #2, MUL VL]\n" "inch x22, ALL, MUL #3\n" "bgt 31b\n" "subs %x[rows], %x[rows], #0x8\n" "add %x[out_ptr], %x[out_ptr], x12\n" "bgt 1b\n" "b 52f\n" "34:" // Accumulate "35:" // Accumulate: Row loop "cmp %x[rows], #0x7\n" "bgt 50f\n" "beq 48f\n" "cmp %x[rows], #0x5\n" "bgt 46f\n" "beq 44f\n" "cmp %x[rows], #0x3\n" "bgt 42f\n" "beq 40f\n" "cmp %x[rows], #0x1\n" "bgt 38f\n" "36:" // Accumulate: Height 1 "mov x11, %x[cols]\n" "mov x10, %x[out_ptr]\n" "37:" // Accumulate: Height 1: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z16.s }, p2/Z, [x10]\n" "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n" "lsl z16.s, z16.s, #0x10\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "fadd z19.s, z19.s, z16.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "whilelt p0.s, x21, x11\n" "decw x11, ALL, MUL #3\n" "incw x21\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "cmp x11, XZR\n" ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n" "st1h { z16.s }, p2, [x10]\n" "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "fadd z18.s, z18.s, z16.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "fadd z17.s, z17.s, z16.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" ".inst 0x658aae30 // bfcvt z16.h, p3/M, z17.s\n" "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" "bgt 37b\n" "b 52f\n" "38:" // Accumulate: Height 2 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "add x28, x10, %x[ldout], LSL #1\n" "39:" // Accumulate: Height 2: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z17.s }, p2/Z, [x10]\n" "ld1h { z16.s }, p2/Z, [x28]\n" "ld1w { z23.s }, p2/Z, [%x[in_ptr]]\n" "lsl z17.s, z17.s, #0x10\n" "ld1w { z22.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "fadd z23.s, z23.s, z17.s\n" "fadd z22.s, z22.s, z16.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "ld1w { z21.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "whilelt p0.s, x21, x11\n" "decw x11, ALL, MUL #3\n" "incw x21\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "cmp x11, XZR\n" ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" "st1h { z17.s }, p2, [x10]\n" "st1h { z16.s }, p2, [x28]\n" "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n" "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z21.s, z21.s, z17.s\n" "fadd z20.s, z20.s, z16.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n" "st1h { z16.s }, p1, [x28, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z19.s, z19.s, z17.s\n" "fadd z18.s, z18.s, z16.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n" "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n" "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" "bgt 39b\n" "b 52f\n" "40:" // Accumulate: Height 3 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "41:" // Accumulate: Height 3: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z18.s }, p2/Z, [x10]\n" "ld1h { z17.s }, p2/Z, [x28]\n" "ld1h { z16.s }, p2/Z, [x27]\n" "ld1w { z26.s }, p2/Z, [%x[in_ptr]]\n" "lsl z19.s, z18.s, #0x10\n" "ld1w { z25.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "lsl z17.s, z17.s, #0x10\n" "ld1w { z18.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "lsl z16.s, z16.s, #0x10\n" "fadd z26.s, z26.s, z19.s\n" "fadd z25.s, z25.s, z17.s\n" "ld1w { z24.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z22.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "fadd z18.s, z18.s, z16.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "whilelt p0.s, x21, x11\n" "decw x11, ALL, MUL #3\n" "incw x21\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n" "cmp x11, XZR\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" "st1h { z17.s }, p2, [x10]\n" "st1h { z16.s }, p2, [x28]\n" ".inst 0x658aae51 // bfcvt z17.h, p3/M, z18.s\n" "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" "st1h { z17.s }, p2, [x27]\n" "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n" "lsl z18.s, z16.s, #0x10\n" "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z24.s, z24.s, z18.s\n" "fadd z23.s, z23.s, z17.s\n" "fadd z22.s, z22.s, z16.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n" ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n" "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" "st1h { z18.s }, p1, [x28, #1, MUL VL]\n" "st1h { z17.s }, p1, [x27, #1, MUL VL]\n" "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n" "lsl z18.s, z16.s, #0x10\n" "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z21.s, z21.s, z18.s\n" "fadd z20.s, z20.s, z17.s\n" "fadd z19.s, z19.s, z16.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n" "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" ".inst 0x658aae91 // bfcvt z17.h, p3/M, z20.s\n" ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n" "st1h { z17.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "bgt 41b\n" "b 52f\n" "42:" // Accumulate: Height 4 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "43:" // Accumulate: Height 4: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z19.s }, p2/Z, [x10]\n" "ld1h { z18.s }, p2/Z, [x28]\n" "ld1h { z17.s }, p2/Z, [x27]\n" "ld1h { z16.s }, p2/Z, [x26]\n" "ld1w { z30.s }, p2/Z, [%x[in_ptr]]\n" "lsl z20.s, z19.s, #0x10\n" "ld1w { z29.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "lsl z18.s, z18.s, #0x10\n" "ld1w { z28.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z19.s }, p2/Z, [x20, #-7, MUL VL]\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z30.s, z30.s, z20.s\n" "fadd z29.s, z29.s, z18.s\n" "ld1w { z27.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "ld1w { z26.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "ld1w { z25.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "ld1w { z24.s }, p1/Z, [x20, #-6, MUL VL]\n" "whilelt p0.s, x21, x11\n" "decw x11, ALL, MUL #3\n" "fadd z28.s, z28.s, z17.s\n" "fadd z19.s, z19.s, z16.s\n" "incw x21\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "ld1w { z23.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "ld1w { z21.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z20.s }, p0/Z, [x20, #-5, MUL VL]\n" "cmp x11, XZR\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmin z19.s, p3/M, z19.s, z12.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "fmax z19.s, p3/M, z19.s, z11.s\n" ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n" ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" "st1h { z18.s }, p2, [x10]\n" "st1h { z17.s }, p2, [x28]\n" ".inst 0x658aae71 // bfcvt z17.h, p3/M, z19.s\n" "st1h { z16.s }, p2, [x27]\n" "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" "st1h { z17.s }, p2, [x26]\n" "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n" "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n" "lsl z19.s, z16.s, #0x10\n" "ld1h { z16.s }, p1/Z, [x26, #1, MUL VL]\n" "lsl z18.s, z18.s, #0x10\n" "lsl z17.s, z17.s, #0x10\n" "fadd z27.s, z27.s, z19.s\n" "lsl z16.s, z16.s, #0x10\n" "fadd z26.s, z26.s, z18.s\n" "fadd z25.s, z25.s, z17.s\n" "fadd z24.s, z24.s, z16.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n" ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n" "st1h { z17.s }, p1, [x10, #1, MUL VL]\n" "st1h { z16.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n" ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" "st1h { z18.s }, p1, [x27, #1, MUL VL]\n" "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n" "lsl z19.s, z16.s, #0x10\n" "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x26, #2, MUL VL]\n" "lsl z18.s, z18.s, #0x10\n" "lsl z17.s, z17.s, #0x10\n" "fadd z23.s, z23.s, z19.s\n" "lsl z16.s, z16.s, #0x10\n" "fadd z22.s, z22.s, z18.s\n" "fadd z21.s, z21.s, z17.s\n" "fadd z20.s, z20.s, z16.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z20.s, p3/M, z20.s, z11.s\n" ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n" ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" "st1h { z17.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "st1h { z16.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "bgt 43b\n" "b 52f\n" "44:" // Accumulate: Height 5 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "45:" // Accumulate: Height 5: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z20.s }, p2/Z, [x10]\n" "ld1h { z19.s }, p2/Z, [x28]\n" "ld1h { z18.s }, p2/Z, [x27]\n" "ld1h { z17.s }, p2/Z, [x26]\n" "ld1h { z16.s }, p2/Z, [x25]\n" "ld1w { z1.s }, p2/Z, [%x[in_ptr]]\n" "lsl z22.s, z20.s, #0x10\n" "ld1w { z0.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "lsl z21.s, z19.s, #0x10\n" "ld1w { z31.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "whilelt p1.s, x21, x11\n" "lsl z19.s, z18.s, #0x10\n" "ld1w { z20.s }, p2/Z, [x20, #-7, MUL VL]\n" "lsl z18.s, z17.s, #0x10\n" "ld1w { z17.s }, p2/Z, [x20, #-4, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "fadd z1.s, z1.s, z22.s\n" "incw x21\n" "fadd z0.s, z0.s, z21.s\n" "ld1w { z30.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "ld1w { z29.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "fadd z31.s, z31.s, z19.s\n" "fadd z20.s, z20.s, z18.s\n" "ld1w { z28.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "ld1w { z27.s }, p1/Z, [x20, #-6, MUL VL]\n" "fadd z17.s, z17.s, z16.s\n" "fmin z1.s, p3/M, z1.s, z12.s\n" "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n" "whilelt p0.s, x21, x11\n" "fmin z0.s, p3/M, z0.s, z12.s\n" "fmin z31.s, p3/M, z31.s, z12.s\n" "fmin z20.s, p3/M, z20.s, z12.s\n" "fmin z17.s, p3/M, z17.s, z12.s\n" "fmax z1.s, p3/M, z1.s, z11.s\n" "ld1w { z25.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z24.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "decw x11, ALL, MUL #3\n" "fmax z0.s, p3/M, z0.s, z11.s\n" "fmax z31.s, p3/M, z31.s, z11.s\n" "ld1w { z23.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n" "fmax z20.s, p3/M, z20.s, z11.s\n" "fmax z17.s, p3/M, z17.s, z11.s\n" "ld1w { z21.s }, p0/Z, [x20, #-2, MUL VL]\n" ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n" "cmp x11, XZR\n" "incw x21\n" ".inst 0x658aac13 // bfcvt z19.h, p3/M, z0.s\n" ".inst 0x658aaff2 // bfcvt z18.h, p3/M, z31.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z16.s }, p2, [x10]\n" ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" "st1h { z19.s }, p2, [x28]\n" "st1h { z18.s }, p2, [x27]\n" "st1h { z16.s }, p2, [x26]\n" "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" "st1h { z17.s }, p2, [x25]\n" "ld1h { z19.s }, p1/Z, [x28, #1, MUL VL]\n" "ld1h { z18.s }, p1/Z, [x27, #1, MUL VL]\n" "ld1h { z17.s }, p1/Z, [x26, #1, MUL VL]\n" "lsl z20.s, z16.s, #0x10\n" "ld1h { z16.s }, p1/Z, [x25, #1, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "lsl z18.s, z18.s, #0x10\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z30.s, z30.s, z20.s\n" "fadd z29.s, z29.s, z19.s\n" "fadd z28.s, z28.s, z18.s\n" "fadd z27.s, z27.s, z17.s\n" "fadd z26.s, z26.s, z16.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n" ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" "st1h { z18.s }, p1, [x10, #1, MUL VL]\n" "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n" ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" "st1h { z16.s }, p1, [x27, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" "st1h { z18.s }, p1, [x26, #1, MUL VL]\n" "st1h { z17.s }, p1, [x25, #1, MUL VL]\n" "ld1h { z19.s }, p0/Z, [x28, #2, MUL VL]\n" "ld1h { z18.s }, p0/Z, [x27, #2, MUL VL]\n" "lsl z20.s, z16.s, #0x10\n" "ld1h { z17.s }, p0/Z, [x26, #2, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x25, #2, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "lsl z18.s, z18.s, #0x10\n" "fadd z25.s, z25.s, z20.s\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z24.s, z24.s, z19.s\n" "fadd z23.s, z23.s, z18.s\n" "fadd z22.s, z22.s, z17.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fadd z21.s, z21.s, z16.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" ".inst 0x658aaf31 // bfcvt z17.h, p3/M, z25.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n" "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n" ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n" "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n" "st1h { z18.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "st1h { z17.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "st1h { z16.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "bgt 45b\n" "b 52f\n" "46:" // Accumulate: Height 6 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "47:" // Accumulate: Height 6: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z21.s }, p2/Z, [x10]\n" "ld1h { z20.s }, p2/Z, [x28]\n" "ld1h { z19.s }, p2/Z, [x27]\n" "ld1h { z18.s }, p2/Z, [x26]\n" "ld1h { z17.s }, p2/Z, [x25]\n" "ld1h { z16.s }, p2/Z, [x24]\n" "ld1w { z6.s }, p2/Z, [%x[in_ptr]]\n" "lsl z22.s, z21.s, #0x10\n" "ld1w { z5.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "lsl z21.s, z20.s, #0x10\n" "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "lsl z20.s, z19.s, #0x10\n" "ld1w { z3.s }, p2/Z, [x20, #-7, MUL VL]\n" "lsl z19.s, z18.s, #0x10\n" "ld1w { z2.s }, p2/Z, [x20, #-4, MUL VL]\n" "lsl z17.s, z17.s, #0x10\n" "ld1w { z18.s }, p2/Z, [x20, #-1, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "fadd z6.s, z6.s, z22.s\n" "fadd z5.s, z5.s, z21.s\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "fadd z4.s, z4.s, z20.s\n" "fadd z3.s, z3.s, z19.s\n" "fadd z2.s, z2.s, z17.s\n" "fadd z18.s, z18.s, z16.s\n" "fmin z6.s, p3/M, z6.s, z12.s\n" "fmin z5.s, p3/M, z5.s, z12.s\n" "ld1w { z1.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "ld1w { z0.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "whilelt p0.s, x21, x11\n" "decw x11, ALL, MUL #3\n" "fmin z4.s, p3/M, z4.s, z12.s\n" "fmin z3.s, p3/M, z3.s, z12.s\n" "ld1w { z31.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "ld1w { z30.s }, p1/Z, [x20, #-6, MUL VL]\n" "fmin z2.s, p3/M, z2.s, z12.s\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "ld1w { z29.s }, p1/Z, [x20, #-3, MUL VL]\n" "ld1w { z28.s }, p1/Z, [x20]\n" "fmax z6.s, p3/M, z6.s, z11.s\n" "fmax z5.s, p3/M, z5.s, z11.s\n" "ld1w { z27.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z26.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "fmax z4.s, p3/M, z4.s, z11.s\n" "fmax z3.s, p3/M, z3.s, z11.s\n" "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n" "ld1w { z24.s }, p0/Z, [x20, #-5, MUL VL]\n" "fmax z2.s, p3/M, z2.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "ld1w { z23.s }, p0/Z, [x20, #-2, MUL VL]\n" "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n" ".inst 0x658aacd5 // bfcvt z21.h, p3/M, z6.s\n" ".inst 0x658aacb4 // bfcvt z20.h, p3/M, z5.s\n" "cmp x11, XZR\n" "incw x21\n" ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n" ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" "st1h { z21.s }, p2, [x10]\n" "st1h { z20.s }, p2, [x28]\n" "st1h { z19.s }, p2, [x27]\n" "st1h { z17.s }, p2, [x26]\n" "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n" "st1h { z16.s }, p2, [x25]\n" "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n" "st1h { z18.s }, p2, [x24]\n" "ld1h { z19.s }, p1/Z, [x27, #1, MUL VL]\n" "ld1h { z18.s }, p1/Z, [x26, #1, MUL VL]\n" "lsl z21.s, z17.s, #0x10\n" "ld1h { z17.s }, p1/Z, [x25, #1, MUL VL]\n" "lsl z20.s, z16.s, #0x10\n" "ld1h { z16.s }, p1/Z, [x24, #1, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "lsl z18.s, z18.s, #0x10\n" "fadd z1.s, z1.s, z21.s\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z0.s, z0.s, z20.s\n" "fadd z31.s, z31.s, z19.s\n" "fadd z30.s, z30.s, z18.s\n" "fmin z1.s, p3/M, z1.s, z12.s\n" "fadd z29.s, z29.s, z17.s\n" "fadd z28.s, z28.s, z16.s\n" "fmin z0.s, p3/M, z0.s, z12.s\n" "fmin z31.s, p3/M, z31.s, z12.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fmax z1.s, p3/M, z1.s, z11.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmax z0.s, p3/M, z0.s, z11.s\n" "fmax z31.s, p3/M, z31.s, z11.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" ".inst 0x658aac34 // bfcvt z20.h, p3/M, z1.s\n" ".inst 0x658aac12 // bfcvt z18.h, p3/M, z0.s\n" ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n" ".inst 0x658aafd1 // bfcvt z17.h, p3/M, z30.s\n" ".inst 0x658aafb0 // bfcvt z16.h, p3/M, z29.s\n" "st1h { z20.s }, p1, [x10, #1, MUL VL]\n" "st1h { z18.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n" "st1h { z19.s }, p1, [x27, #1, MUL VL]\n" "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n" "st1h { z16.s }, p1, [x25, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n" "st1h { z18.s }, p1, [x24, #1, MUL VL]\n" "ld1h { z19.s }, p0/Z, [x27, #2, MUL VL]\n" "ld1h { z18.s }, p0/Z, [x26, #2, MUL VL]\n" "lsl z21.s, z17.s, #0x10\n" "ld1h { z17.s }, p0/Z, [x25, #2, MUL VL]\n" "lsl z20.s, z16.s, #0x10\n" "ld1h { z16.s }, p0/Z, [x24, #2, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "lsl z18.s, z18.s, #0x10\n" "fadd z27.s, z27.s, z21.s\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z26.s, z26.s, z20.s\n" "fadd z25.s, z25.s, z19.s\n" "fadd z24.s, z24.s, z18.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fadd z23.s, z23.s, z17.s\n" "fadd z22.s, z22.s, z16.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n" ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n" ".inst 0x658aaf33 // bfcvt z19.h, p3/M, z25.s\n" ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n" ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" "st1h { z20.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" "inch x28, ALL, MUL #3\n" "st1h { z19.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" "st1h { z18.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "st1h { z17.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "st1h { z16.s }, p0, [x24, #2, MUL VL]\n" "inch x24, ALL, MUL #3\n" "bgt 47b\n" "b 52f\n" "48:" // Accumulate: Height 7 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "add x23, x24, %x[ldout], LSL #1\n" "49:" // Accumulate: Height 7: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z22.s }, p2/Z, [x10]\n" "ld1h { z21.s }, p2/Z, [x28]\n" "ld1h { z20.s }, p2/Z, [x27]\n" "ld1h { z19.s }, p2/Z, [x26]\n" "ld1h { z18.s }, p2/Z, [x25]\n" "ld1h { z17.s }, p2/Z, [x24]\n" "ld1h { z16.s }, p2/Z, [x23]\n" "ld1w { z8.s }, p2/Z, [%x[in_ptr]]\n" "lsl z25.s, z22.s, #0x10\n" "lsl z24.s, z21.s, #0x10\n" "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "lsl z20.s, z20.s, #0x10\n" "lsl z19.s, z19.s, #0x10\n" "ld1w { z23.s }, p2/Z, [x20, #-7, MUL VL]\n" "ld1w { z6.s }, p2/Z, [x20, #-4, MUL VL]\n" "lsl z18.s, z18.s, #0x10\n" "lsl z17.s, z17.s, #0x10\n" "ld1w { z5.s }, p2/Z, [x20, #-1, MUL VL]\n" "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "fadd z8.s, z8.s, z25.s\n" "fadd z21.s, z21.s, z24.s\n" "fadd z7.s, z7.s, z20.s\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "fadd z23.s, z23.s, z19.s\n" "fadd z6.s, z6.s, z18.s\n" "fadd z5.s, z5.s, z17.s\n" "fadd z22.s, z22.s, z16.s\n" "fmin z8.s, p3/M, z8.s, z12.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z7.s, p3/M, z7.s, z12.s\n" "ld1w { z4.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "ld1w { z3.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "whilelt p0.s, x21, x11\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z6.s, p3/M, z6.s, z12.s\n" "ld1w { z2.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "ld1w { z1.s }, p1/Z, [x20, #-6, MUL VL]\n" "fmin z5.s, p3/M, z5.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "ld1w { z0.s }, p1/Z, [x20, #-3, MUL VL]\n" "ld1w { z31.s }, p1/Z, [x20]\n" "fmax z8.s, p3/M, z8.s, z11.s\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "ld1w { z30.s }, p1/Z, [x20, #3, MUL VL]\n" "ld1w { z29.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "fmax z7.s, p3/M, z7.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "ld1w { z28.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "ld1w { z27.s }, p0/Z, [x20, #-8, MUL VL]\n" "fmax z6.s, p3/M, z6.s, z11.s\n" "fmax z5.s, p3/M, z5.s, z11.s\n" "ld1w { z26.s }, p0/Z, [x20, #-5, MUL VL]\n" "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n" "fmax z22.s, p3/M, z22.s, z11.s\n" ".inst 0x658aad13 // bfcvt z19.h, p3/M, z8.s\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" "ld1w { z24.s }, p0/Z, [x20, #1, MUL VL]\n" ".inst 0x658aacf4 // bfcvt z20.h, p3/M, z7.s\n" ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n" "ld1w { z23.s }, p0/Z, [x20, #4, MUL VL]\n" "decw x11, ALL, MUL #3\n" ".inst 0x658aacd1 // bfcvt z17.h, p3/M, z6.s\n" ".inst 0x658aacb0 // bfcvt z16.h, p3/M, z5.s\n" "incw x21\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z19.s }, p2, [x10]\n" ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n" "st1h { z21.s }, p2, [x28]\n" "cmp x11, XZR\n" "st1h { z20.s }, p2, [x27]\n" "st1h { z18.s }, p2, [x26]\n" "ld1h { z18.s }, p1/Z, [x10, #1, MUL VL]\n" "st1h { z17.s }, p2, [x25]\n" "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n" "st1h { z16.s }, p2, [x24]\n" "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n" "st1h { z19.s }, p2, [x23]\n" "ld1h { z19.s }, p1/Z, [x26, #1, MUL VL]\n" "lsl z22.s, z18.s, #0x10\n" "ld1h { z18.s }, p1/Z, [x25, #1, MUL VL]\n" "lsl z21.s, z17.s, #0x10\n" "ld1h { z17.s }, p1/Z, [x24, #1, MUL VL]\n" "lsl z20.s, z16.s, #0x10\n" "ld1h { z16.s }, p1/Z, [x23, #1, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "lsl z18.s, z18.s, #0x10\n" "fadd z4.s, z4.s, z22.s\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z3.s, z3.s, z21.s\n" "fadd z2.s, z2.s, z20.s\n" "fadd z1.s, z1.s, z19.s\n" "fadd z0.s, z0.s, z18.s\n" "fadd z31.s, z31.s, z17.s\n" "fmin z4.s, p3/M, z4.s, z12.s\n" "fadd z30.s, z30.s, z16.s\n" "fmin z3.s, p3/M, z3.s, z12.s\n" "fmin z2.s, p3/M, z2.s, z12.s\n" "fmin z1.s, p3/M, z1.s, z12.s\n" "fmin z0.s, p3/M, z0.s, z12.s\n" "fmin z31.s, p3/M, z31.s, z12.s\n" "fmax z4.s, p3/M, z4.s, z11.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fmax z3.s, p3/M, z3.s, z11.s\n" "fmax z2.s, p3/M, z2.s, z11.s\n" "fmax z1.s, p3/M, z1.s, z11.s\n" "fmax z0.s, p3/M, z0.s, z11.s\n" "fmax z31.s, p3/M, z31.s, z11.s\n" ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" ".inst 0x658aac74 // bfcvt z20.h, p3/M, z3.s\n" ".inst 0x658aac53 // bfcvt z19.h, p3/M, z2.s\n" ".inst 0x658aac32 // bfcvt z18.h, p3/M, z1.s\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aac11 // bfcvt z17.h, p3/M, z0.s\n" ".inst 0x658aaff0 // bfcvt z16.h, p3/M, z31.s\n" "st1h { z20.s }, p1, [x28, #1, MUL VL]\n" "st1h { z19.s }, p1, [x27, #1, MUL VL]\n" ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n" "st1h { z18.s }, p1, [x26, #1, MUL VL]\n" "ld1h { z18.s }, p0/Z, [x10, #2, MUL VL]\n" "st1h { z17.s }, p1, [x25, #1, MUL VL]\n" "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n" "st1h { z16.s }, p1, [x24, #1, MUL VL]\n" "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n" "st1h { z19.s }, p1, [x23, #1, MUL VL]\n" "ld1h { z19.s }, p0/Z, [x26, #2, MUL VL]\n" "lsl z22.s, z18.s, #0x10\n" "ld1h { z18.s }, p0/Z, [x25, #2, MUL VL]\n" "lsl z21.s, z17.s, #0x10\n" "ld1h { z17.s }, p0/Z, [x24, #2, MUL VL]\n" "lsl z20.s, z16.s, #0x10\n" "ld1h { z16.s }, p0/Z, [x23, #2, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "lsl z18.s, z18.s, #0x10\n" "fadd z29.s, z29.s, z22.s\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fadd z28.s, z28.s, z21.s\n" "fadd z27.s, z27.s, z20.s\n" "fadd z26.s, z26.s, z19.s\n" "fadd z25.s, z25.s, z18.s\n" "fadd z24.s, z24.s, z17.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fadd z23.s, z23.s, z16.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" ".inst 0x658aaf94 // bfcvt z20.h, p3/M, z28.s\n" ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n" ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n" "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" "inch x10, ALL, MUL #3\n" ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n" ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" "st1h { z20.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n" "inch x27, ALL, MUL #3\n" "st1h { z19.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "st1h { z18.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "st1h { z17.s }, p0, [x24, #2, MUL VL]\n" "inch x24, ALL, MUL #3\n" "st1h { z16.s }, p0, [x23, #2, MUL VL]\n" "inch x23, ALL, MUL #3\n" "bgt 49b\n" "b 52f\n" "50:" // Accumulate: Height 8 "mov x10, %x[out_ptr]\n" "mov x11, %x[cols]\n" "add x28, x10, %x[ldout], LSL #1\n" "add x27, x28, %x[ldout], LSL #1\n" "add x26, x27, %x[ldout], LSL #1\n" "add x25, x26, %x[ldout], LSL #1\n" "add x24, x25, %x[ldout], LSL #1\n" "add x23, x24, %x[ldout], LSL #1\n" "add x22, x23, %x[ldout], LSL #1\n" "51:" // Accumulate: Height 8: Block loop "mov x21, #0x0\n" "addvl x20, %x[in_ptr], #16\n" "whilelt p2.s, x21, x11\n" "incw x21\n" "ld1h { z23.s }, p2/Z, [x10]\n" "ld1h { z22.s }, p2/Z, [x28]\n" "ld1h { z21.s }, p2/Z, [x27]\n" "ld1h { z20.s }, p2/Z, [x26]\n" "ld1h { z19.s }, p2/Z, [x25]\n" "ld1h { z18.s }, p2/Z, [x24]\n" "ld1h { z17.s }, p2/Z, [x23]\n" "ld1h { z16.s }, p2/Z, [x22]\n" "lsl z31.s, z23.s, #0x10\n" "lsl z30.s, z22.s, #0x10\n" "ld1w { z29.s }, p2/Z, [%x[in_ptr]]\n" "ld1w { z28.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" "lsl z27.s, z21.s, #0x10\n" "lsl z26.s, z20.s, #0x10\n" "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" "ld1w { z25.s }, p2/Z, [x20, #-7, MUL VL]\n" "lsl z20.s, z19.s, #0x10\n" "lsl z19.s, z18.s, #0x10\n" "ld1w { z18.s }, p2/Z, [x20, #-4, MUL VL]\n" "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "ld1w { z23.s }, p2/Z, [x20, #2, MUL VL]\n" "ld1w { z22.s }, p2/Z, [x20, #5, MUL VL]\n" "fadd z29.s, z29.s, z31.s\n" "fadd z28.s, z28.s, z30.s\n" "fadd z21.s, z21.s, z27.s\n" "fadd z25.s, z25.s, z26.s\n" "whilelt p1.s, x21, x11\n" "incw x21\n" "fadd z18.s, z18.s, z20.s\n" "fadd z24.s, z24.s, z19.s\n" "fadd z23.s, z23.s, z17.s\n" "fadd z22.s, z22.s, z16.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fmin z21.s, p3/M, z21.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "ld1w { z6.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" "ld1w { z5.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" "fmin z18.s, p3/M, z18.s, z12.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "ld1w { z4.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmin z22.s, p3/M, z22.s, z12.s\n" "ld1w { z2.s }, p1/Z, [x20, #-3, MUL VL]\n" "ld1w { z1.s }, p1/Z, [x20]\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" "ld1w { z31.s }, p1/Z, [x20, #6, MUL VL]\n" "fmax z21.s, p3/M, z21.s, z11.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" "fmax z18.s, p3/M, z18.s, z11.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "fmax z22.s, p3/M, z22.s, z11.s\n" ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n" ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n" ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" "whilelt p0.s, x21, x11\n" "decw x11, ALL, MUL #3\n" ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" "incw x21\n" "st1h { z20.s }, p2, [x10]\n" "st1h { z19.s }, p2, [x28]\n" ".inst 0x658aaef4 // bfcvt z20.h, p3/M, z23.s\n" ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n" "st1h { z21.s }, p2, [x27]\n" "ld1w { z30.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" "ld1w { z29.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" "cmp x11, XZR\n" "st1h { z16.s }, p2, [x26]\n" "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" "ld1w { z28.s }, p0/Z, [x20, #-8, MUL VL]\n" "addvl %x[in_ptr], %x[in_ptr], #24\n" "st1h { z18.s }, p2, [x25]\n" "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n" "ld1w { z27.s }, p0/Z, [x20, #-5, MUL VL]\n" "st1h { z17.s }, p2, [x24]\n" "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n" "ld1w { z26.s }, p0/Z, [x20, #-2, MUL VL]\n" "st1h { z20.s }, p2, [x23]\n" "ld1h { z20.s }, p1/Z, [x26, #1, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "ld1w { z25.s }, p0/Z, [x20, #1, MUL VL]\n" "st1h { z19.s }, p2, [x22]\n" "ld1h { z19.s }, p1/Z, [x25, #1, MUL VL]\n" "lsl z22.s, z18.s, #0x10\n" "ld1w { z24.s }, p0/Z, [x20, #4, MUL VL]\n" "ld1h { z18.s }, p1/Z, [x24, #1, MUL VL]\n" "lsl z21.s, z17.s, #0x10\n" "ld1w { z23.s }, p0/Z, [x20, #7, MUL VL]\n" "ld1h { z17.s }, p1/Z, [x23, #1, MUL VL]\n" "lsl z20.s, z20.s, #0x10\n" "fadd z6.s, z6.s, z16.s\n" "ld1h { z16.s }, p1/Z, [x22, #1, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "fadd z5.s, z5.s, z22.s\n" "lsl z18.s, z18.s, #0x10\n" "fadd z4.s, z4.s, z21.s\n" "lsl z17.s, z17.s, #0x10\n" "lsl z16.s, z16.s, #0x10\n" "fmin z6.s, p3/M, z6.s, z12.s\n" "fadd z3.s, z3.s, z20.s\n" "fadd z2.s, z2.s, z19.s\n" "fmin z5.s, p3/M, z5.s, z12.s\n" "fadd z1.s, z1.s, z18.s\n" "fmin z4.s, p3/M, z4.s, z12.s\n" "fadd z0.s, z0.s, z17.s\n" "fadd z31.s, z31.s, z16.s\n" "fmax z6.s, p3/M, z6.s, z11.s\n" "fmin z3.s, p3/M, z3.s, z12.s\n" "fmin z2.s, p3/M, z2.s, z12.s\n" "fmax z5.s, p3/M, z5.s, z11.s\n" "fmin z1.s, p3/M, z1.s, z12.s\n" "fmin z0.s, p3/M, z0.s, z12.s\n" "fmin z31.s, p3/M, z31.s, z12.s\n" "fmax z4.s, p3/M, z4.s, z11.s\n" ".inst 0x658aacd0 // bfcvt z16.h, p3/M, z6.s\n" "fmax z3.s, p3/M, z3.s, z11.s\n" "fmax z2.s, p3/M, z2.s, z11.s\n" ".inst 0x658aacb1 // bfcvt z17.h, p3/M, z5.s\n" "fmax z1.s, p3/M, z1.s, z11.s\n" "fmax z0.s, p3/M, z0.s, z11.s\n" "fmax z31.s, p3/M, z31.s, z11.s\n" "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n" "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" ".inst 0x658aac75 // bfcvt z21.h, p3/M, z3.s\n" ".inst 0x658aac52 // bfcvt z18.h, p3/M, z2.s\n" ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n" ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n" "st1h { z16.s }, p1, [x27, #1, MUL VL]\n" ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n" "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" "st1h { z21.s }, p1, [x26, #1, MUL VL]\n" "st1h { z18.s }, p1, [x25, #1, MUL VL]\n" "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n" "st1h { z17.s }, p1, [x24, #1, MUL VL]\n" "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n" "st1h { z20.s }, p1, [x23, #1, MUL VL]\n" "ld1h { z20.s }, p0/Z, [x26, #2, MUL VL]\n" "lsl z16.s, z16.s, #0x10\n" "st1h { z19.s }, p1, [x22, #1, MUL VL]\n" "ld1h { z19.s }, p0/Z, [x25, #2, MUL VL]\n" "lsl z22.s, z18.s, #0x10\n" "ld1h { z18.s }, p0/Z, [x24, #2, MUL VL]\n" "lsl z21.s, z17.s, #0x10\n" "ld1h { z17.s }, p0/Z, [x23, #2, MUL VL]\n" "lsl z20.s, z20.s, #0x10\n" "fadd z30.s, z30.s, z16.s\n" "ld1h { z16.s }, p0/Z, [x22, #2, MUL VL]\n" "lsl z19.s, z19.s, #0x10\n" "lsl z18.s, z18.s, #0x10\n" "fadd z29.s, z29.s, z22.s\n" "lsl z17.s, z17.s, #0x10\n" "fadd z28.s, z28.s, z21.s\n" "lsl z16.s, z16.s, #0x10\n" "fadd z27.s, z27.s, z20.s\n" "fmin z30.s, p3/M, z30.s, z12.s\n" "fadd z26.s, z26.s, z19.s\n" "fadd z25.s, z25.s, z18.s\n" "fmin z29.s, p3/M, z29.s, z12.s\n" "fadd z24.s, z24.s, z17.s\n" "fmin z28.s, p3/M, z28.s, z12.s\n" "fadd z23.s, z23.s, z16.s\n" "fmin z27.s, p3/M, z27.s, z12.s\n" "fmax z30.s, p3/M, z30.s, z11.s\n" "fmin z26.s, p3/M, z26.s, z12.s\n" "fmin z25.s, p3/M, z25.s, z12.s\n" "fmax z29.s, p3/M, z29.s, z11.s\n" "fmin z24.s, p3/M, z24.s, z12.s\n" "fmin z23.s, p3/M, z23.s, z12.s\n" "fmax z28.s, p3/M, z28.s, z11.s\n" "fmax z27.s, p3/M, z27.s, z11.s\n" ".inst 0x658aafd0 // bfcvt z16.h, p3/M, z30.s\n" "fmax z26.s, p3/M, z26.s, z11.s\n" "fmax z25.s, p3/M, z25.s, z11.s\n" ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" "fmax z24.s, p3/M, z24.s, z11.s\n" "fmax z23.s, p3/M, z23.s, z11.s\n" "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n" "inch x10, ALL, MUL #3\n" "st1h { z17.s }, p0, [x28, #2, MUL VL]\n" "inch x28, ALL, MUL #3\n" ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n" ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n" "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" "inch x27, ALL, MUL #3\n" ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n" "st1h { z20.s }, p0, [x26, #2, MUL VL]\n" "inch x26, ALL, MUL #3\n" "st1h { z19.s }, p0, [x25, #2, MUL VL]\n" "inch x25, ALL, MUL #3\n" "st1h { z18.s }, p0, [x24, #2, MUL VL]\n" "inch x24, ALL, MUL #3\n" "st1h { z17.s }, p0, [x23, #2, MUL VL]\n" "inch x23, ALL, MUL #3\n" "st1h { z16.s }, p0, [x22, #2, MUL VL]\n" "inch x22, ALL, MUL #3\n" "bgt 51b\n" "subs %x[rows], %x[rows], #0x8\n" "add %x[out_ptr], %x[out_ptr], x12\n" "bgt 35b\n" "52:" // Exit : [in_ptr] "+&r" (in_ptr), [out_ptr] "+&r" (out_ptr), [rows] "+&r" (rows) : [accumulate] "r" (accumulate), [bias] "r" (bias), [cols] "r" (cols), [ldout] "r" (ldout), [maxval] "r" (maxval), [minval] "r" (minval) : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } #endif // ARM_COMPUTE_ENABLE_SVE