aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp 2137
1 files changed, 2137 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp
new file mode 100644
index 0000000000..5d4a8bf347
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_bf16_8x3VL.hpp
@@ -0,0 +1,2137 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+template<>
+void MergeResults<3, 8, true>(
+ bfloat16 *out_ptr,
+ const float * in_ptr,
+ const int ldout,
+ const int y0, const int ymax,
+ const int x0, const int xmax,
+ const bfloat16 *bias,
+ Activation act,
+ bool accumulate)
+{
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ minval = 0;
+ break;
+ }
+
+ size_t rows = ymax-y0;
+ size_t cols = xmax-x0;
+
+ out_ptr += (y0 * ldout) + x0;
+ bias = (bias == nullptr) ? nullptr : bias + x0;
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "cbz %x[cols], 52f\n"
+ "cbz %x[rows], 52f\n"
+ "mov x12, #0x20\n"
+ "dup z12.s, %w[maxval]\n"
+ "dup z11.s, %w[minval]\n"
+ "mul x12, %x[ldout], x12\n"
+ "cbnz %x[accumulate], 34f\n"
+ "1:" // Initial: Row loop
+ "cmp %x[rows], #0x7\n"
+ "bgt 30f\n"
+ "beq 26f\n"
+ "cmp %x[rows], #0x5\n"
+ "bgt 22f\n"
+ "beq 18f\n"
+ "cmp %x[rows], #0x3\n"
+ "bgt 14f\n"
+ "beq 10f\n"
+ "cmp %x[rows], #0x1\n"
+ "bgt 6f\n"
+ "2:" // Initial: Height 1
+ "mov x11, %x[cols]\n"
+ "mov x10, %x[out_ptr]\n"
+ "mov x9, %x[bias]\n"
+ "3:" // Initial: Height 1: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 4f\n"
+ "mov z21.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "b 5f\n"
+ "4:" // Initial: Height 1: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z21.s, z18.s, #0x10\n"
+ "lsl z20.s, z17.s, #0x10\n"
+ "lsl z19.s, z16.s, #0x10\n"
+ "5:" // Initial: Height 1: Width 3: init done
+ "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z18.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "fadd z17.s, z17.s, z21.s\n"
+ "fadd z16.s, z16.s, z20.s\n"
+ "cmp x11, XZR\n"
+ "fadd z18.s, z18.s, z19.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "bgt 3b\n"
+ "b 52f\n"
+ "6:" // Initial: Height 2
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "7:" // Initial: Height 2: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 8f\n"
+ "mov z24.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "b 9f\n"
+ "8:" // Initial: Height 2: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z24.s, z18.s, #0x10\n"
+ "lsl z23.s, z17.s, #0x10\n"
+ "lsl z22.s, z16.s, #0x10\n"
+ "9:" // Initial: Height 2: Width 3: init done
+ "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "fadd z17.s, z17.s, z24.s\n"
+ "fadd z16.s, z16.s, z23.s\n"
+ "cmp x11, XZR\n"
+ "fadd z19.s, z19.s, z22.s\n"
+ "fadd z18.s, z18.s, z24.s\n"
+ "fadd z21.s, z21.s, z23.s\n"
+ "fadd z20.s, z20.s, z22.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ "st1h { z19.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x28]\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "bgt 7b\n"
+ "b 52f\n"
+ "10:" // Initial: Height 3
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "11:" // Initial: Height 3: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 12f\n"
+ "mov z27.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "b 13f\n"
+ "12:" // Initial: Height 3: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z27.s, z18.s, #0x10\n"
+ "lsl z26.s, z17.s, #0x10\n"
+ "lsl z25.s, z16.s, #0x10\n"
+ "13:" // Initial: Height 3: Width 3: init done
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z18.s, z18.s, z27.s\n"
+ "fadd z17.s, z17.s, z26.s\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "fadd z16.s, z16.s, z25.s\n"
+ "fadd z21.s, z21.s, z27.s\n"
+ "cmp x11, XZR\n"
+ "fadd z20.s, z20.s, z26.s\n"
+ "fadd z19.s, z19.s, z25.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "fadd z24.s, z24.s, z27.s\n"
+ "fadd z23.s, z23.s, z26.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z22.s, z22.s, z25.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "st1h { z18.s }, p2, [x10]\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n"
+ "st1h { z17.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x28]\n"
+ "st1h { z20.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x27]\n"
+ "st1h { z17.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "bgt 11b\n"
+ "b 52f\n"
+ "14:" // Initial: Height 4
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "15:" // Initial: Height 4: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 16f\n"
+ "mov z30.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "b 17f\n"
+ "16:" // Initial: Height 4: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z30.s, z18.s, #0x10\n"
+ "lsl z29.s, z17.s, #0x10\n"
+ "lsl z28.s, z16.s, #0x10\n"
+ "17:" // Initial: Height 4: Width 3: init done
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z18.s, z18.s, z30.s\n"
+ "fadd z17.s, z17.s, z29.s\n"
+ "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z16.s, z16.s, z28.s\n"
+ "fadd z24.s, z24.s, z30.s\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z23.s, z23.s, z29.s\n"
+ "fadd z22.s, z22.s, z28.s\n"
+ "fadd z21.s, z21.s, z30.s\n"
+ "fadd z20.s, z20.s, z29.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z19.s, z19.s, z28.s\n"
+ "fadd z27.s, z27.s, z30.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fadd z26.s, z26.s, z29.s\n"
+ "fadd z25.s, z25.s, z28.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z18.s }, p2, [x10]\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "st1h { z17.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x28]\n"
+ "st1h { z23.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x27]\n"
+ "st1h { z20.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x26]\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "bgt 15b\n"
+ "b 52f\n"
+ "18:" // Initial: Height 5
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "19:" // Initial: Height 5: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 20f\n"
+ "mov z1.b, #0x0\n"
+ "mov z0.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "b 21f\n"
+ "20:" // Initial: Height 5: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z1.s, z18.s, #0x10\n"
+ "lsl z0.s, z17.s, #0x10\n"
+ "lsl z31.s, z16.s, #0x10\n"
+ "21:" // Initial: Height 5: Width 3: init done
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z21.s, z21.s, z1.s\n"
+ "fadd z20.s, z20.s, z0.s\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z19.s, z19.s, z31.s\n"
+ "fadd z18.s, z18.s, z1.s\n"
+ "ld1w { z29.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z17.s, z17.s, z0.s\n"
+ "fadd z16.s, z16.s, z31.s\n"
+ "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z24.s, z24.s, z1.s\n"
+ "fadd z23.s, z23.s, z0.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "fadd z22.s, z22.s, z31.s\n"
+ "fadd z30.s, z30.s, z1.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fadd z29.s, z29.s, z0.s\n"
+ "fadd z28.s, z28.s, z31.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fadd z27.s, z27.s, z1.s\n"
+ "fadd z26.s, z26.s, z0.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z25.s, z25.s, z31.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "st1h { z21.s }, p2, [x10]\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z20.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ "st1h { z19.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ ".inst 0x658aafd5 // bfcvt z21.h, p3/M, z30.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x28]\n"
+ ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n"
+ ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x27]\n"
+ "st1h { z23.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x26]\n"
+ "st1h { z20.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x25]\n"
+ "st1h { z17.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "bgt 19b\n"
+ "b 52f\n"
+ "22:" // Initial: Height 6
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "23:" // Initial: Height 6: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 24f\n"
+ "mov z4.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov z2.b, #0x0\n"
+ "b 25f\n"
+ "24:" // Initial: Height 6: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z4.s, z18.s, #0x10\n"
+ "lsl z3.s, z17.s, #0x10\n"
+ "lsl z2.s, z16.s, #0x10\n"
+ "25:" // Initial: Height 6: Width 3: init done
+ "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z17.s, z17.s, z4.s\n"
+ "fadd z16.s, z16.s, z3.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z21.s, z21.s, z2.s\n"
+ "fadd z20.s, z20.s, z4.s\n"
+ "ld1w { z23.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z19.s, z19.s, z3.s\n"
+ "fadd z18.s, z18.s, z2.s\n"
+ "ld1w { z31.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z1.s, z1.s, z4.s\n"
+ "fadd z0.s, z0.s, z3.s\n"
+ "ld1w { z29.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "fadd z25.s, z25.s, z2.s\n"
+ "fadd z24.s, z24.s, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "ld1w { z26.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fadd z23.s, z23.s, z3.s\n"
+ "fadd z22.s, z22.s, z2.s\n"
+ "fadd z31.s, z31.s, z4.s\n"
+ "fadd z30.s, z30.s, z3.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fadd z29.s, z29.s, z2.s\n"
+ "fadd z28.s, z28.s, z4.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fadd z27.s, z27.s, z3.s\n"
+ "fadd z26.s, z26.s, z2.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac10 // bfcvt z16.h, p3/M, z0.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "st1h { z21.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z20.s }, p2, [x28]\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ "st1h { z19.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ ".inst 0x658aaff5 // bfcvt z21.h, p3/M, z31.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aafd4 // bfcvt z20.h, p3/M, z30.s\n"
+ ".inst 0x658aafb3 // bfcvt z19.h, p3/M, z29.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z17.s }, p2, [x27]\n"
+ ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n"
+ ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n"
+ "st1h { z25.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x26]\n"
+ "st1h { z23.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x25]\n"
+ "st1h { z20.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x24]\n"
+ "st1h { z17.s }, p1, [x24, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "bgt 23b\n"
+ "b 52f\n"
+ "26:" // Initial: Height 7
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "27:" // Initial: Height 7: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 28f\n"
+ "mov z7.b, #0x0\n"
+ "mov z6.b, #0x0\n"
+ "mov z5.b, #0x0\n"
+ "b 29f\n"
+ "28:" // Initial: Height 7: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z7.s, z18.s, #0x10\n"
+ "lsl z6.s, z17.s, #0x10\n"
+ "lsl z5.s, z16.s, #0x10\n"
+ "29:" // Initial: Height 7: Width 3: init done
+ "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z19.s, z19.s, z7.s\n"
+ "fadd z18.s, z18.s, z6.s\n"
+ "ld1w { z2.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z17.s, z17.s, z5.s\n"
+ "fadd z16.s, z16.s, z7.s\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z21.s, z21.s, z6.s\n"
+ "fadd z20.s, z20.s, z5.s\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z4.s, z4.s, z7.s\n"
+ "fadd z3.s, z3.s, z6.s\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z0.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "fadd z2.s, z2.s, z5.s\n"
+ "fadd z1.s, z1.s, z7.s\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "ld1w { z30.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fadd z26.s, z26.s, z6.s\n"
+ "fadd z25.s, z25.s, z5.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "fadd z24.s, z24.s, z7.s\n"
+ "fadd z23.s, z23.s, z6.s\n"
+ "ld1w { z27.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "fadd z22.s, z22.s, z5.s\n"
+ "fadd z0.s, z0.s, z7.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fadd z31.s, z31.s, z6.s\n"
+ "fadd z30.s, z30.s, z5.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fadd z29.s, z29.s, z7.s\n"
+ "fadd z28.s, z28.s, z6.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fadd z27.s, z27.s, z5.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "st1h { z19.s }, p2, [x10]\n"
+ ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "st1h { z18.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac72 // bfcvt z18.h, p3/M, z3.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aac51 // bfcvt z17.h, p3/M, z2.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "st1h { z16.s }, p2, [x28]\n"
+ ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z21.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n"
+ "st1h { z20.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z19.s }, p2, [x27]\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z18.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aac15 // bfcvt z21.h, p3/M, z0.s\n"
+ ".inst 0x658aaff4 // bfcvt z20.h, p3/M, z31.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z17.s }, p0, [x27, #2, MUL VL]\n"
+ ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n"
+ ".inst 0x658aafb2 // bfcvt z18.h, p3/M, z29.s\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z16.s }, p2, [x26]\n"
+ ".inst 0x658aaf91 // bfcvt z17.h, p3/M, z28.s\n"
+ ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n"
+ "st1h { z26.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z25.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x25]\n"
+ "st1h { z23.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x24]\n"
+ "st1h { z20.s }, p1, [x24, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x23]\n"
+ "st1h { z17.s }, p1, [x23, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "bgt 27b\n"
+ "b 52f\n"
+ "30:" // Initial: Height 8
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "mov x9, %x[bias]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "31:" // Initial: Height 8: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "whilelt p0.s, x21, x11\n"
+ "incw x21\n"
+ "cbnz %x[bias], 32f\n"
+ "mov z10.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z8.b, #0x0\n"
+ "b 33f\n"
+ "32:" // Initial: Height 8: Width 3: bias
+ "ld1h { z18.s }, p2/Z, [x9]\n"
+ "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n"
+ "lsl z10.s, z18.s, #0x10\n"
+ "lsl z9.s, z17.s, #0x10\n"
+ "lsl z8.s, z16.s, #0x10\n"
+ "33:" // Initial: Height 8: Width 3: init done
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "inch x9, ALL, MUL #3\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z6.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z21.s, z21.s, z10.s\n"
+ "fadd z20.s, z20.s, z9.s\n"
+ "ld1w { z5.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "fadd z19.s, z19.s, z8.s\n"
+ "fadd z18.s, z18.s, z10.s\n"
+ "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fadd z17.s, z17.s, z9.s\n"
+ "fadd z16.s, z16.s, z8.s\n"
+ "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "fadd z7.s, z7.s, z10.s\n"
+ "fadd z6.s, z6.s, z9.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "fadd z5.s, z5.s, z8.s\n"
+ "fadd z4.s, z4.s, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fadd z3.s, z3.s, z9.s\n"
+ "fadd z2.s, z2.s, z8.s\n"
+ "ld1w { z1.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "fadd z27.s, z27.s, z10.s\n"
+ "fadd z26.s, z26.s, z9.s\n"
+ "ld1w { z31.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #5, MUL VL]\n"
+ "fadd z25.s, z25.s, z8.s\n"
+ "fadd z24.s, z24.s, z10.s\n"
+ "ld1w { z29.s }, p1/Z, [x20, #6, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x20, #7, MUL VL]\n"
+ "fadd z23.s, z23.s, z9.s\n"
+ "fadd z22.s, z22.s, z8.s\n"
+ "fadd z1.s, z1.s, z10.s\n"
+ "fadd z0.s, z0.s, z9.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fadd z31.s, z31.s, z8.s\n"
+ "fadd z30.s, z30.s, z10.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fadd z29.s, z29.s, z9.s\n"
+ "fadd z28.s, z28.s, z8.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmin z16.s, p3/M, z16.s, z12.s\n"
+ "fmin z7.s, p3/M, z7.s, z12.s\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "fmax z16.s, p3/M, z16.s, z11.s\n"
+ "fmax z7.s, p3/M, z7.s, z11.s\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "st1h { z21.s }, p2, [x10]\n"
+ ".inst 0x658aacf5 // bfcvt z21.h, p3/M, z7.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "st1h { z20.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aacd4 // bfcvt z20.h, p3/M, z6.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "st1h { z19.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aacb3 // bfcvt z19.h, p3/M, z5.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "st1h { z18.s }, p2, [x28]\n"
+ ".inst 0x658aac92 // bfcvt z18.h, p3/M, z4.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n"
+ "cmp x11, XZR\n"
+ "st1h { z21.s }, p2, [x27]\n"
+ ".inst 0x658aaf7b // bfcvt z27.h, p3/M, z27.s\n"
+ ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n"
+ "st1h { z20.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n"
+ ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x27, #2, MUL VL]\n"
+ ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n"
+ ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x26]\n"
+ ".inst 0x658aac35 // bfcvt z21.h, p3/M, z1.s\n"
+ ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n"
+ ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p0, [x26, #2, MUL VL]\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z27.s }, p2, [x25]\n"
+ "st1h { z26.s }, p1, [x25, #1, MUL VL]\n"
+ "st1h { z25.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z24.s }, p2, [x24]\n"
+ "st1h { z23.s }, p1, [x24, #1, MUL VL]\n"
+ "st1h { z22.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z21.s }, p2, [x23]\n"
+ "st1h { z20.s }, p1, [x23, #1, MUL VL]\n"
+ "st1h { z19.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "st1h { z18.s }, p2, [x22]\n"
+ "st1h { z17.s }, p1, [x22, #1, MUL VL]\n"
+ "st1h { z16.s }, p0, [x22, #2, MUL VL]\n"
+ "inch x22, ALL, MUL #3\n"
+ "bgt 31b\n"
+ "subs %x[rows], %x[rows], #0x8\n"
+ "add %x[out_ptr], %x[out_ptr], x12\n"
+ "bgt 1b\n"
+ "b 52f\n"
+ "34:" // Accumulate
+ "35:" // Accumulate: Row loop
+ "cmp %x[rows], #0x7\n"
+ "bgt 50f\n"
+ "beq 48f\n"
+ "cmp %x[rows], #0x5\n"
+ "bgt 46f\n"
+ "beq 44f\n"
+ "cmp %x[rows], #0x3\n"
+ "bgt 42f\n"
+ "beq 40f\n"
+ "cmp %x[rows], #0x1\n"
+ "bgt 38f\n"
+ "36:" // Accumulate: Height 1
+ "mov x11, %x[cols]\n"
+ "mov x10, %x[out_ptr]\n"
+ "37:" // Accumulate: Height 1: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z16.s }, p2/Z, [x10]\n"
+ "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z19.s, z19.s, z16.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "incw x21\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "cmp x11, XZR\n"
+ ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n"
+ "st1h { z16.s }, p2, [x10]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z17.s, z17.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ ".inst 0x658aae30 // bfcvt z16.h, p3/M, z17.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "bgt 37b\n"
+ "b 52f\n"
+ "38:" // Accumulate: Height 2
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "39:" // Accumulate: Height 2: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z17.s }, p2/Z, [x10]\n"
+ "ld1h { z16.s }, p2/Z, [x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z22.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "ld1w { z21.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "incw x21\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "cmp x11, XZR\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p2, [x28]\n"
+ "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z21.s, z21.s, z17.s\n"
+ "fadd z20.s, z20.s, z16.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z16.s }, p1, [x28, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z19.s, z19.s, z17.s\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "bgt 39b\n"
+ "b 52f\n"
+ "40:" // Accumulate: Height 3
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "41:" // Accumulate: Height 3: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z18.s }, p2/Z, [x10]\n"
+ "ld1h { z17.s }, p2/Z, [x28]\n"
+ "ld1h { z16.s }, p2/Z, [x27]\n"
+ "ld1w { z26.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z25.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z18.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z26.s, z26.s, z19.s\n"
+ "fadd z25.s, z25.s, z17.s\n"
+ "ld1w { z24.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "incw x21\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "cmp x11, XZR\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "st1h { z17.s }, p2, [x10]\n"
+ "st1h { z16.s }, p2, [x28]\n"
+ ".inst 0x658aae51 // bfcvt z17.h, p3/M, z18.s\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x27]\n"
+ "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "lsl z18.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z24.s, z24.s, z18.s\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n"
+ ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x27, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "lsl z18.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z21.s, z21.s, z18.s\n"
+ "fadd z20.s, z20.s, z17.s\n"
+ "fadd z19.s, z19.s, z16.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aae91 // bfcvt z17.h, p3/M, z20.s\n"
+ ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n"
+ "st1h { z17.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "bgt 41b\n"
+ "b 52f\n"
+ "42:" // Accumulate: Height 4
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "43:" // Accumulate: Height 4: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z19.s }, p2/Z, [x10]\n"
+ "ld1h { z18.s }, p2/Z, [x28]\n"
+ "ld1h { z17.s }, p2/Z, [x27]\n"
+ "ld1h { z16.s }, p2/Z, [x26]\n"
+ "ld1w { z30.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z20.s, z19.s, #0x10\n"
+ "ld1w { z29.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "ld1w { z28.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z30.s, z30.s, z20.s\n"
+ "fadd z29.s, z29.s, z18.s\n"
+ "ld1w { z27.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "fadd z28.s, z28.s, z17.s\n"
+ "fadd z19.s, z19.s, z16.s\n"
+ "incw x21\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "ld1w { z23.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "ld1w { z21.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "cmp x11, XZR\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z19.s, p3/M, z19.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z19.s, p3/M, z19.s, z11.s\n"
+ ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ "st1h { z18.s }, p2, [x10]\n"
+ "st1h { z17.s }, p2, [x28]\n"
+ ".inst 0x658aae71 // bfcvt z17.h, p3/M, z19.s\n"
+ "st1h { z16.s }, p2, [x27]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x26]\n"
+ "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "lsl z19.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "fadd z27.s, z27.s, z19.s\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z26.s, z26.s, z18.s\n"
+ "fadd z25.s, z25.s, z17.s\n"
+ "fadd z24.s, z24.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n"
+ ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n"
+ "st1h { z17.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z16.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "lsl z19.s, z16.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "fadd z23.s, z23.s, z19.s\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z22.s, z22.s, z18.s\n"
+ "fadd z21.s, z21.s, z17.s\n"
+ "fadd z20.s, z20.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ "st1h { z17.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "bgt 43b\n"
+ "b 52f\n"
+ "44:" // Accumulate: Height 5
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "45:" // Accumulate: Height 5: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z20.s }, p2/Z, [x10]\n"
+ "ld1h { z19.s }, p2/Z, [x28]\n"
+ "ld1h { z18.s }, p2/Z, [x27]\n"
+ "ld1h { z17.s }, p2/Z, [x26]\n"
+ "ld1h { z16.s }, p2/Z, [x25]\n"
+ "ld1w { z1.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z22.s, z20.s, #0x10\n"
+ "ld1w { z0.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z21.s, z19.s, #0x10\n"
+ "ld1w { z31.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "whilelt p1.s, x21, x11\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z20.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "lsl z18.s, z17.s, #0x10\n"
+ "ld1w { z17.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z1.s, z1.s, z22.s\n"
+ "incw x21\n"
+ "fadd z0.s, z0.s, z21.s\n"
+ "ld1w { z30.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "fadd z31.s, z31.s, z19.s\n"
+ "fadd z20.s, z20.s, z18.s\n"
+ "ld1w { z28.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fadd z17.s, z17.s, z16.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z20.s, p3/M, z20.s, z12.s\n"
+ "fmin z17.s, p3/M, z17.s, z12.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z24.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "ld1w { z23.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fmax z20.s, p3/M, z20.s, z11.s\n"
+ "fmax z17.s, p3/M, z17.s, z11.s\n"
+ "ld1w { z21.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n"
+ "cmp x11, XZR\n"
+ "incw x21\n"
+ ".inst 0x658aac13 // bfcvt z19.h, p3/M, z0.s\n"
+ ".inst 0x658aaff2 // bfcvt z18.h, p3/M, z31.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z16.s }, p2, [x10]\n"
+ ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n"
+ ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n"
+ "st1h { z19.s }, p2, [x28]\n"
+ "st1h { z18.s }, p2, [x27]\n"
+ "st1h { z16.s }, p2, [x26]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x25]\n"
+ "ld1h { z19.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z18.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z17.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z30.s, z30.s, z20.s\n"
+ "fadd z29.s, z29.s, z19.s\n"
+ "fadd z28.s, z28.s, z18.s\n"
+ "fadd z27.s, z27.s, z17.s\n"
+ "fadd z26.s, z26.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ "st1h { z18.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n"
+ ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n"
+ "st1h { z16.s }, p1, [x27, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z25.s, z25.s, z20.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z24.s, z24.s, z19.s\n"
+ "fadd z23.s, z23.s, z18.s\n"
+ "fadd z22.s, z22.s, z17.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fadd z21.s, z21.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aaf31 // bfcvt z17.h, p3/M, z25.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n"
+ ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n"
+ "st1h { z18.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "bgt 45b\n"
+ "b 52f\n"
+ "46:" // Accumulate: Height 6
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "47:" // Accumulate: Height 6: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z21.s }, p2/Z, [x10]\n"
+ "ld1h { z20.s }, p2/Z, [x28]\n"
+ "ld1h { z19.s }, p2/Z, [x27]\n"
+ "ld1h { z18.s }, p2/Z, [x26]\n"
+ "ld1h { z17.s }, p2/Z, [x25]\n"
+ "ld1h { z16.s }, p2/Z, [x24]\n"
+ "ld1w { z6.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z22.s, z21.s, #0x10\n"
+ "ld1w { z5.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z21.s, z20.s, #0x10\n"
+ "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "lsl z20.s, z19.s, #0x10\n"
+ "ld1w { z3.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z2.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z18.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z6.s, z6.s, z22.s\n"
+ "fadd z5.s, z5.s, z21.s\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z4.s, z4.s, z20.s\n"
+ "fadd z3.s, z3.s, z19.s\n"
+ "fadd z2.s, z2.s, z17.s\n"
+ "fadd z18.s, z18.s, z16.s\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z1.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z31.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "ld1w { z29.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z27.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1w { z24.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "ld1w { z23.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x658aacd5 // bfcvt z21.h, p3/M, z6.s\n"
+ ".inst 0x658aacb4 // bfcvt z20.h, p3/M, z5.s\n"
+ "cmp x11, XZR\n"
+ "incw x21\n"
+ ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n"
+ ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ "st1h { z21.s }, p2, [x10]\n"
+ "st1h { z20.s }, p2, [x28]\n"
+ "st1h { z19.s }, p2, [x27]\n"
+ "st1h { z17.s }, p2, [x26]\n"
+ "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z16.s }, p2, [x25]\n"
+ "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "st1h { z18.s }, p2, [x24]\n"
+ "ld1h { z19.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z1.s, z1.s, z21.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z0.s, z0.s, z20.s\n"
+ "fadd z31.s, z31.s, z19.s\n"
+ "fadd z30.s, z30.s, z18.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fadd z29.s, z29.s, z17.s\n"
+ "fadd z28.s, z28.s, z16.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ ".inst 0x658aac34 // bfcvt z20.h, p3/M, z1.s\n"
+ ".inst 0x658aac12 // bfcvt z18.h, p3/M, z0.s\n"
+ ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n"
+ ".inst 0x658aafd1 // bfcvt z17.h, p3/M, z30.s\n"
+ ".inst 0x658aafb0 // bfcvt z16.h, p3/M, z29.s\n"
+ "st1h { z20.s }, p1, [x10, #1, MUL VL]\n"
+ "st1h { z18.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n"
+ "st1h { z19.s }, p1, [x27, #1, MUL VL]\n"
+ "st1h { z17.s }, p1, [x26, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z16.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "st1h { z18.s }, p1, [x24, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z27.s, z27.s, z21.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z26.s, z26.s, z20.s\n"
+ "fadd z25.s, z25.s, z19.s\n"
+ "fadd z24.s, z24.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n"
+ ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n"
+ ".inst 0x658aaf33 // bfcvt z19.h, p3/M, z25.s\n"
+ ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n"
+ ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n"
+ "st1h { z20.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x28, #2, MUL VL]\n"
+ ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "bgt 47b\n"
+ "b 52f\n"
+ "48:" // Accumulate: Height 7
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "49:" // Accumulate: Height 7: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z22.s }, p2/Z, [x10]\n"
+ "ld1h { z21.s }, p2/Z, [x28]\n"
+ "ld1h { z20.s }, p2/Z, [x27]\n"
+ "ld1h { z19.s }, p2/Z, [x26]\n"
+ "ld1h { z18.s }, p2/Z, [x25]\n"
+ "ld1h { z17.s }, p2/Z, [x24]\n"
+ "ld1h { z16.s }, p2/Z, [x23]\n"
+ "ld1w { z8.s }, p2/Z, [%x[in_ptr]]\n"
+ "lsl z25.s, z22.s, #0x10\n"
+ "lsl z24.s, z21.s, #0x10\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "lsl z20.s, z20.s, #0x10\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "ld1w { z23.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "ld1w { z5.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z8.s, z8.s, z25.s\n"
+ "fadd z21.s, z21.s, z24.s\n"
+ "fadd z7.s, z7.s, z20.s\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z23.s, z23.s, z19.s\n"
+ "fadd z6.s, z6.s, z18.s\n"
+ "fadd z5.s, z5.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z8.s, p3/M, z8.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z7.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z4.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "whilelt p0.s, x21, x11\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z2.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z1.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "ld1w { z0.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "fmax z8.s, p3/M, z8.s, z11.s\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "ld1w { z30.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z29.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "fmax z7.s, p3/M, z7.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "ld1w { z28.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z26.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aad13 // bfcvt z19.h, p3/M, z8.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ "ld1w { z24.s }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x658aacf4 // bfcvt z20.h, p3/M, z7.s\n"
+ ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n"
+ "ld1w { z23.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "decw x11, ALL, MUL #3\n"
+ ".inst 0x658aacd1 // bfcvt z17.h, p3/M, z6.s\n"
+ ".inst 0x658aacb0 // bfcvt z16.h, p3/M, z5.s\n"
+ "incw x21\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z19.s }, p2, [x10]\n"
+ ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n"
+ "st1h { z21.s }, p2, [x28]\n"
+ "cmp x11, XZR\n"
+ "st1h { z20.s }, p2, [x27]\n"
+ "st1h { z18.s }, p2, [x26]\n"
+ "ld1h { z18.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "st1h { z17.s }, p2, [x25]\n"
+ "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "st1h { z16.s }, p2, [x24]\n"
+ "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "st1h { z19.s }, p2, [x23]\n"
+ "ld1h { z19.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1h { z18.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z4.s, z4.s, z22.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z3.s, z3.s, z21.s\n"
+ "fadd z2.s, z2.s, z20.s\n"
+ "fadd z1.s, z1.s, z19.s\n"
+ "fadd z0.s, z0.s, z18.s\n"
+ "fadd z31.s, z31.s, z17.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fadd z30.s, z30.s, z16.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ ".inst 0x658aac74 // bfcvt z20.h, p3/M, z3.s\n"
+ ".inst 0x658aac53 // bfcvt z19.h, p3/M, z2.s\n"
+ ".inst 0x658aac32 // bfcvt z18.h, p3/M, z1.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac11 // bfcvt z17.h, p3/M, z0.s\n"
+ ".inst 0x658aaff0 // bfcvt z16.h, p3/M, z31.s\n"
+ "st1h { z20.s }, p1, [x28, #1, MUL VL]\n"
+ "st1h { z19.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n"
+ "st1h { z18.s }, p1, [x26, #1, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z17.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "st1h { z16.s }, p1, [x24, #1, MUL VL]\n"
+ "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "st1h { z19.s }, p1, [x23, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1h { z18.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "lsl z20.s, z16.s, #0x10\n"
+ "ld1h { z16.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z29.s, z29.s, z22.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z28.s, z28.s, z21.s\n"
+ "fadd z27.s, z27.s, z20.s\n"
+ "fadd z26.s, z26.s, z19.s\n"
+ "fadd z25.s, z25.s, z18.s\n"
+ "fadd z24.s, z24.s, z17.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fadd z23.s, z23.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ ".inst 0x658aaf94 // bfcvt z20.h, p3/M, z28.s\n"
+ ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n"
+ ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n"
+ "st1h { z17.s }, p0, [x10, #2, MUL VL]\n"
+ "inch x10, ALL, MUL #3\n"
+ ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ "st1h { z20.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n"
+ "inch x27, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "bgt 49b\n"
+ "b 52f\n"
+ "50:" // Accumulate: Height 8
+ "mov x10, %x[out_ptr]\n"
+ "mov x11, %x[cols]\n"
+ "add x28, x10, %x[ldout], LSL #1\n"
+ "add x27, x28, %x[ldout], LSL #1\n"
+ "add x26, x27, %x[ldout], LSL #1\n"
+ "add x25, x26, %x[ldout], LSL #1\n"
+ "add x24, x25, %x[ldout], LSL #1\n"
+ "add x23, x24, %x[ldout], LSL #1\n"
+ "add x22, x23, %x[ldout], LSL #1\n"
+ "51:" // Accumulate: Height 8: Block loop
+ "mov x21, #0x0\n"
+ "addvl x20, %x[in_ptr], #16\n"
+ "whilelt p2.s, x21, x11\n"
+ "incw x21\n"
+ "ld1h { z23.s }, p2/Z, [x10]\n"
+ "ld1h { z22.s }, p2/Z, [x28]\n"
+ "ld1h { z21.s }, p2/Z, [x27]\n"
+ "ld1h { z20.s }, p2/Z, [x26]\n"
+ "ld1h { z19.s }, p2/Z, [x25]\n"
+ "ld1h { z18.s }, p2/Z, [x24]\n"
+ "ld1h { z17.s }, p2/Z, [x23]\n"
+ "ld1h { z16.s }, p2/Z, [x22]\n"
+ "lsl z31.s, z23.s, #0x10\n"
+ "lsl z30.s, z22.s, #0x10\n"
+ "ld1w { z29.s }, p2/Z, [%x[in_ptr]]\n"
+ "ld1w { z28.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n"
+ "lsl z27.s, z21.s, #0x10\n"
+ "lsl z26.s, z20.s, #0x10\n"
+ "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #-7, MUL VL]\n"
+ "lsl z20.s, z19.s, #0x10\n"
+ "lsl z19.s, z18.s, #0x10\n"
+ "ld1w { z18.s }, p2/Z, [x20, #-4, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "ld1w { z23.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x20, #5, MUL VL]\n"
+ "fadd z29.s, z29.s, z31.s\n"
+ "fadd z28.s, z28.s, z30.s\n"
+ "fadd z21.s, z21.s, z27.s\n"
+ "fadd z25.s, z25.s, z26.s\n"
+ "whilelt p1.s, x21, x11\n"
+ "incw x21\n"
+ "fadd z18.s, z18.s, z20.s\n"
+ "fadd z24.s, z24.s, z19.s\n"
+ "fadd z23.s, z23.s, z17.s\n"
+ "fadd z22.s, z22.s, z16.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fmin z21.s, p3/M, z21.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "ld1w { z6.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n"
+ "ld1w { z5.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z12.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "ld1w { z4.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmin z22.s, p3/M, z22.s, z12.s\n"
+ "ld1w { z2.s }, p1/Z, [x20, #-3, MUL VL]\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #6, MUL VL]\n"
+ "fmax z21.s, p3/M, z21.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ "fmax z18.s, p3/M, z18.s, z11.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z11.s\n"
+ ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n"
+ ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n"
+ ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n"
+ ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n"
+ "whilelt p0.s, x21, x11\n"
+ "decw x11, ALL, MUL #3\n"
+ ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ "incw x21\n"
+ "st1h { z20.s }, p2, [x10]\n"
+ "st1h { z19.s }, p2, [x28]\n"
+ ".inst 0x658aaef4 // bfcvt z20.h, p3/M, z23.s\n"
+ ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n"
+ "st1h { z21.s }, p2, [x27]\n"
+ "ld1w { z30.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n"
+ "ld1w { z29.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n"
+ "cmp x11, XZR\n"
+ "st1h { z16.s }, p2, [x26]\n"
+ "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x20, #-8, MUL VL]\n"
+ "addvl %x[in_ptr], %x[in_ptr], #24\n"
+ "st1h { z18.s }, p2, [x25]\n"
+ "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #-5, MUL VL]\n"
+ "st1h { z17.s }, p2, [x24]\n"
+ "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x20, #-2, MUL VL]\n"
+ "st1h { z20.s }, p2, [x23]\n"
+ "ld1h { z20.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "ld1w { z25.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "st1h { z19.s }, p2, [x22]\n"
+ "ld1h { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1w { z24.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1h { z18.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1w { z23.s }, p0/Z, [x20, #7, MUL VL]\n"
+ "ld1h { z17.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "lsl z20.s, z20.s, #0x10\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "ld1h { z16.s }, p1/Z, [x22, #1, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "fadd z5.s, z5.s, z22.s\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z4.s, z4.s, z21.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fmin z6.s, p3/M, z6.s, z12.s\n"
+ "fadd z3.s, z3.s, z20.s\n"
+ "fadd z2.s, z2.s, z19.s\n"
+ "fmin z5.s, p3/M, z5.s, z12.s\n"
+ "fadd z1.s, z1.s, z18.s\n"
+ "fmin z4.s, p3/M, z4.s, z12.s\n"
+ "fadd z0.s, z0.s, z17.s\n"
+ "fadd z31.s, z31.s, z16.s\n"
+ "fmax z6.s, p3/M, z6.s, z11.s\n"
+ "fmin z3.s, p3/M, z3.s, z12.s\n"
+ "fmin z2.s, p3/M, z2.s, z12.s\n"
+ "fmax z5.s, p3/M, z5.s, z11.s\n"
+ "fmin z1.s, p3/M, z1.s, z12.s\n"
+ "fmin z0.s, p3/M, z0.s, z12.s\n"
+ "fmin z31.s, p3/M, z31.s, z12.s\n"
+ "fmax z4.s, p3/M, z4.s, z11.s\n"
+ ".inst 0x658aacd0 // bfcvt z16.h, p3/M, z6.s\n"
+ "fmax z3.s, p3/M, z3.s, z11.s\n"
+ "fmax z2.s, p3/M, z2.s, z11.s\n"
+ ".inst 0x658aacb1 // bfcvt z17.h, p3/M, z5.s\n"
+ "fmax z1.s, p3/M, z1.s, z11.s\n"
+ "fmax z0.s, p3/M, z0.s, z11.s\n"
+ "fmax z31.s, p3/M, z31.s, z11.s\n"
+ "st1h { z16.s }, p1, [x10, #1, MUL VL]\n"
+ ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n"
+ "st1h { z17.s }, p1, [x28, #1, MUL VL]\n"
+ ".inst 0x658aac75 // bfcvt z21.h, p3/M, z3.s\n"
+ ".inst 0x658aac52 // bfcvt z18.h, p3/M, z2.s\n"
+ ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n"
+ ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n"
+ "st1h { z16.s }, p1, [x27, #1, MUL VL]\n"
+ ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n"
+ "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n"
+ "st1h { z21.s }, p1, [x26, #1, MUL VL]\n"
+ "st1h { z18.s }, p1, [x25, #1, MUL VL]\n"
+ "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n"
+ "st1h { z17.s }, p1, [x24, #1, MUL VL]\n"
+ "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "st1h { z20.s }, p1, [x23, #1, MUL VL]\n"
+ "ld1h { z20.s }, p0/Z, [x26, #2, MUL VL]\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "st1h { z19.s }, p1, [x22, #1, MUL VL]\n"
+ "ld1h { z19.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "lsl z22.s, z18.s, #0x10\n"
+ "ld1h { z18.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "lsl z21.s, z17.s, #0x10\n"
+ "ld1h { z17.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "lsl z20.s, z20.s, #0x10\n"
+ "fadd z30.s, z30.s, z16.s\n"
+ "ld1h { z16.s }, p0/Z, [x22, #2, MUL VL]\n"
+ "lsl z19.s, z19.s, #0x10\n"
+ "lsl z18.s, z18.s, #0x10\n"
+ "fadd z29.s, z29.s, z22.s\n"
+ "lsl z17.s, z17.s, #0x10\n"
+ "fadd z28.s, z28.s, z21.s\n"
+ "lsl z16.s, z16.s, #0x10\n"
+ "fadd z27.s, z27.s, z20.s\n"
+ "fmin z30.s, p3/M, z30.s, z12.s\n"
+ "fadd z26.s, z26.s, z19.s\n"
+ "fadd z25.s, z25.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z12.s\n"
+ "fadd z24.s, z24.s, z17.s\n"
+ "fmin z28.s, p3/M, z28.s, z12.s\n"
+ "fadd z23.s, z23.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z12.s\n"
+ "fmax z30.s, p3/M, z30.s, z11.s\n"
+ "fmin z26.s, p3/M, z26.s, z12.s\n"
+ "fmin z25.s, p3/M, z25.s, z12.s\n"
+ "fmax z29.s, p3/M, z29.s, z11.s\n"
+ "fmin z24.s, p3/M, z24.s, z12.s\n"
+ "fmin z23.s, p3/M, z23.s, z12.s\n"
+ "fmax z28.s, p3/M, z28.s, z11.s\n"
+ "fmax z27.s, p3/M, z27.s, z11.s\n"
+ ".inst 0x658aafd0 // bfcvt z16.h, p3/M, z30.s\n"
+ "fmax z26.s, p3/M, z26.s, z11.s\n"
+ "fmax z25.s, p3/M, z25.s, z11.s\n"
+ ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n"
+ "fmax z24.s, p3/M, z24.s, z11.s\n"
+ "fmax z23.s, p3/M, z23.s, z11.s\n"
+ "st1h { z16.s }, p0, [x10, #2, MUL VL]\n"
+ ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n"
+ ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n"
+ "inch x10, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x28, #2, MUL VL]\n"
+ "inch x28, ALL, MUL #3\n"
+ ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n"
+ ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n"
+ "st1h { z16.s }, p0, [x27, #2, MUL VL]\n"
+ "inch x27, ALL, MUL #3\n"
+ ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n"
+ ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n"
+ "st1h { z20.s }, p0, [x26, #2, MUL VL]\n"
+ "inch x26, ALL, MUL #3\n"
+ "st1h { z19.s }, p0, [x25, #2, MUL VL]\n"
+ "inch x25, ALL, MUL #3\n"
+ "st1h { z18.s }, p0, [x24, #2, MUL VL]\n"
+ "inch x24, ALL, MUL #3\n"
+ "st1h { z17.s }, p0, [x23, #2, MUL VL]\n"
+ "inch x23, ALL, MUL #3\n"
+ "st1h { z16.s }, p0, [x22, #2, MUL VL]\n"
+ "inch x22, ALL, MUL #3\n"
+ "bgt 51b\n"
+ "subs %x[rows], %x[rows], #0x8\n"
+ "add %x[out_ptr], %x[out_ptr], x12\n"
+ "bgt 35b\n"
+ "52:" // Exit
+ : [in_ptr] "+&r" (in_ptr), [out_ptr] "+&r" (out_ptr), [rows] "+&r" (rows)
+ : [accumulate] "r" (accumulate), [bias] "r" (bias), [cols] "r" (cols), [ldout] "r" (ldout), [maxval] "r" (maxval), [minval] "r" (minval)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+#endif // ARM_COMPUTE_ENABLE_SVE