aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/kernels
diff options
context:
space:
mode:
authorViet-Hoa Do <viet-hoa.do@arm.com>2022-06-01 11:47:14 +0100
committerViet-Hoa Do <viet-hoa.do@arm.com>2022-11-28 16:57:42 +0000
commit03b2971ac69a86f10a1566938d1a25afee15746c (patch)
treeaec7cfc047e1da278b4b71a706cda7b1b0faa158 /src/core/NEON/kernels/arm_gemm/kernels
parent7dc0234331f2150a6b4ac5c2b49de419870f7cf5 (diff)
downloadComputeLibrary-03b2971ac69a86f10a1566938d1a25afee15746c.tar.gz
Integrate SME2 kernels
* Add SME/SME2 detection. * Integrate SME2 implementation for: - Normal convolution - Winograd - Depthwise convolution - Pooling Resolves: COMPMID-5700 Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com> Change-Id: I2f1ca1d05f8cfeee9309ed1c0a36096a4a6aad5c Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8692 Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels')
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp87
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp554
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp553
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp87
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp611
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp678
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp678
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp420
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp486
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp418
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp484
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp616
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp408
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp455
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp507
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp345
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp378
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp444
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp408
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp455
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp507
40 files changed, 11865 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
new file mode 100644
index 0000000000..f86bcebe64
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_bf16fp32_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_bf16fp32_dot_16VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 2> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_bf16fp32_dot_16VL;
+ cls_sme2_gemv_bf16fp32_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..26861fb931
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
@@ -0,0 +1,554 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_bf16fp32_dot_16VL (
+ const bfloat16 *A_ptr, const bfloat16 *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x21, x21, #0x1\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 28f\n"
+ "cmp x26, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x8\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 13f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x20, #0x8\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "cmp x20, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 21f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x20, #0x8\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "cmp x20, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 29f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa043c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x20, #0x8\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "cmp x20, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ ".inst 0xa043a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ ".inst 0xa043a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "35:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p1.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
new file mode 100644
index 0000000000..f33cb9a33d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const float *, const float *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_fp32_mla_16VL( ARGLIST );
+
+class cls_sme2_gemv_fp32_mla_16VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 1> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_fp32_mla_16VL;
+ cls_sme2_gemv_fp32_mla_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
new file mode 100644
index 0000000000..4c0ae2c6bd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_fp32_mla_16VL (
+ const float *A_ptr, const float *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x21, x21, #0x2\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 28f\n"
+ "cmp x26, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x4\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "cmp x20, #0x4\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 13f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x20, #0x4\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "cmp x20, #0x4\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 21f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x20, #0x4\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "cmp x20, #0x4\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 29f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa043c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x20, #0x4\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "cmp x20, #0x4\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ ".inst 0xa043c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ ".inst 0xa043c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "35:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p1.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
new file mode 100644
index 0000000000..f52fbcd57f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ const float *, const bfloat16 *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_fp32bf16fp32_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_fp32bf16fp32_dot_16VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 2> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_fp32bf16fp32_dot_16VL;
+ cls_sme2_gemv_fp32bf16fp32_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..8b8bcb6bc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_fp32bf16fp32_dot_16VL (
+ const float *A_ptr, const bfloat16 *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x28, ALL, MUL #4\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x21, x27, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x28\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x26, #0x4\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x21, x21, #0x1\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x27, #0x4\n"
+ "bge 28f\n"
+ "cmp x27, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x8\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "sub x19, %x[N], x28\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 13f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x20, #0x8\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ "ble 17f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+ ".inst 0xa061c30c // st1w { z12.s-z15.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c30c // st1w { z12.s-z15.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x28, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 21f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c6fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x20, #0x8\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ "ble 25f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
+ ".inst 0xa062c304 // st1w { z4.s-z7.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c304 // st1w { z4.s-z7.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x28, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 29f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c6fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
+ ".inst 0xa043c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x20, #0x8\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa043a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150bf9b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
+ "ble 33f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa043a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150bf9b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+ ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
+ ".inst 0xa062c704 // st1w { z4.s-z7.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc1b2cba0 // fclamp { z0.s-z3.s }, z29.s, z18.s\n"
+ ".inst 0xa063c300 // st1w { z0.s-z3.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c704 // st1w { z4.s-z7.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c300 // st1w { z0.s-z3.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "35:" // Width 4: Output done
+ "subs x27, x27, #0x4\n"
+ "sub %x[N], %x[N], x28, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p2.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
new file mode 100644
index 0000000000..4c9f9cff9a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int8_t *, size_t, size_t, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+void sme2_gemv_s8qa_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_s8qa_dot_16VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 4> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_s8qa_dot_16VL;
+ cls_sme2_gemv_s8qa_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..348c709119
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sme2_gemv_s8qa_dot_16VL (
+ const int8_t *A_ptr, const int8_t *B_ptr, int8_t *output_ptr,
+ size_t N, size_t K,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ ARM_COMPUTE_UNUSED(col_base);
+
+ struct KernelArgs {
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[col_bias]\n"
+ "mov z26.s, #0x0\n"
+ "mov z24.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 34f\n"
+ "cmp x26, #0x2\n"
+ "bgt 24f\n"
+ "beq 14f\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x10\n"
+ "ble 9f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "8:" // Width 1: Multiply loop: unique 1: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "10:" // Width 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "11:" // Width 1: Multiply loop: unique 2: skip row sum
+ "tbnz %x[flags], #31, 12f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "12:" // Width 1: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "13:" // Width 1: Output done
+ "b 44f\n"
+ "14:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 15f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ "b 16f\n"
+ "15:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "16:" // Width 2: setup done
+ "cmp x20, #0x10\n"
+ "ble 19f\n"
+ "17:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 18f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "18:" // Width 2: Multiply loop: unique 3: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 17b\n"
+ "19:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "20:" // Width 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 21f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "21:" // Width 2: Multiply loop: unique 4: skip row sum
+ "tbnz %x[flags], #31, 22f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "22:" // Width 2: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "23:" // Width 2: Output done
+ "b 44f\n"
+ "24:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 25f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ "b 26f\n"
+ "25:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "26:" // Width 3: setup done
+ "cmp x20, #0x10\n"
+ "ble 29f\n"
+ "27:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "28:" // Width 3: Multiply loop: unique 5: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 27b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "30:" // Width 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 31f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "31:" // Width 3: Multiply loop: unique 6: skip row sum
+ "tbnz %x[flags], #31, 32f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "32:" // Width 3: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "st1b { z0.b }, p1, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "33:" // Width 3: Output done
+ "b 44f\n"
+ "34:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 35f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa043c2f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 36f\n"
+ "35:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "36:" // Width 4: setup done
+ "cmp x20, #0x10\n"
+ "ble 39f\n"
+ "37:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 38f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "38:" // Width 4: Multiply loop: unique 7: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 37b\n"
+ "39:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "40:" // Width 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 41f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "41:" // Width 4: Multiply loop: unique 8: skip row sum
+ "tbnz %x[flags], #31, 42f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "42:" // Width 4: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
+ ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z0.b }, p2, [x24, #2, MUL VL]\n"
+ "st1b { z8.b }, p1, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "43:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "44:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p2.b\n"
+ : [N] "+&r" (N), [flags] "+&r" (flags)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
new file mode 100644
index 0000000000..e15b95445e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint8_t *, size_t, size_t, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+void sme2_gemv_u8qa_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_u8qa_dot_16VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 4> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_u8qa_dot_16VL;
+ cls_sme2_gemv_u8qa_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..9822f637fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sme2_gemv_u8qa_dot_16VL (
+ const uint8_t *A_ptr, const uint8_t *B_ptr, uint8_t *output_ptr,
+ size_t N, size_t K,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ ARM_COMPUTE_UNUSED(col_base);
+
+ struct KernelArgs {
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[col_bias]\n"
+ "mov z26.s, #0x0\n"
+ "mov z24.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 34f\n"
+ "cmp x26, #0x2\n"
+ "bgt 24f\n"
+ "beq 14f\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x10\n"
+ "ble 9f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "8:" // Width 1: Multiply loop: unique 1: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "10:" // Width 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "11:" // Width 1: Multiply loop: unique 2: skip row sum
+ "tbnz %x[flags], #31, 12f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "12:" // Width 1: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "13:" // Width 1: Output done
+ "b 44f\n"
+ "14:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 15f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ "b 16f\n"
+ "15:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "16:" // Width 2: setup done
+ "cmp x20, #0x10\n"
+ "ble 19f\n"
+ "17:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 18f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "18:" // Width 2: Multiply loop: unique 3: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 17b\n"
+ "19:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "20:" // Width 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 21f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "21:" // Width 2: Multiply loop: unique 4: skip row sum
+ "tbnz %x[flags], #31, 22f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "22:" // Width 2: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "23:" // Width 2: Output done
+ "b 44f\n"
+ "24:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 25f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ "b 26f\n"
+ "25:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "26:" // Width 3: setup done
+ "cmp x20, #0x10\n"
+ "ble 29f\n"
+ "27:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "28:" // Width 3: Multiply loop: unique 5: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 27b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "30:" // Width 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 31f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "31:" // Width 3: Multiply loop: unique 6: skip row sum
+ "tbnz %x[flags], #31, 32f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "32:" // Width 3: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "st1b { z0.b }, p1, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "33:" // Width 3: Output done
+ "b 44f\n"
+ "34:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 35f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa043c2f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 36f\n"
+ "35:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "36:" // Width 4: setup done
+ "cmp x20, #0x10\n"
+ "ble 39f\n"
+ "37:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 38f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "38:" // Width 4: Multiply loop: unique 7: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 37b\n"
+ "39:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "40:" // Width 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 41f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "41:" // Width 4: Multiply loop: unique 8: skip row sum
+ "tbnz %x[flags], #31, 42f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "42:" // Width 4: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
+ ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z0.b }, p2, [x24, #2, MUL VL]\n"
+ "st1b { z8.b }, p1, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "43:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "44:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p2.b\n"
+ : [N] "+&r" (N), [flags] "+&r" (flags)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..37eb63d898
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..bb8cad3357
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const bfloat16 *const A,
+ const bfloat16 *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const bfloat16 *const A;
+ const bfloat16 *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa01cc27d // ldnt1w { z28.s-z31.s }, p8/Z, [x19, x28, LSL #2]\n"
+ ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
+ ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
+ ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
+ ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x28\n"
+ "mov x20, x9\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x27\n"
+ "csel x20, x9, x20, LT\n"
+ "mov x19, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x20, x10\n"
+ "csel x14, x19, x14, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x1\n"
+ "lsr x19, x19, #0x1\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x28, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1h { z0.h }, p0/Z, [x25]\n"
+ ".inst 0xa140a6db // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x22]\n"
+ "ld1h { z13.h }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa141a6ca // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1h { z12.h }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142a6cb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1h { z26.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143a6d8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
+ ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
+ ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+ "ld1h { z0.h }, p0/Z, [x25]\n"
+ ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
+ ".inst 0xa140a6db // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x22]\n"
+ ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
+ ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
+ ".inst 0x818e01a3 // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
+ "ld1h { z13.h }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
+ ".inst 0xa141a6ca // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
+ ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
+ ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
+ "ld1h { z12.h }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142a6cb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
+ ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
+ ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
+ "ld1h { z26.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143a6d8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+ ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
+ ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
+ ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+ ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
+ ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
+ ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
+ ".inst 0x818e01a3 // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
+ ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
+ ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
+ ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
+ ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
+ ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
+ ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
+ ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1h { z0.h }, p0/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xa140a6d3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+ ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
+ ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
+ ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
+ "addvl x13, x13, #16\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c56c // st1w { z12.s-z15.s }, pn9.b, [x11]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28, LSL #2\n" // C += n
+ "sub x23, x10, x9\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
+ "tbz x14, #2, 18f\n"
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "csel x21, x23, x19, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x23, x23, x21\n"
+ "beq 18f\n"
+ "b 22f\n"
+ "18:" // Store to output array: Skip activation: End
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x19, x23, x19, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "21:" // Store to output array: Accumulator row 0 oddments: End
+ "22:" // Store to output array: End
+ "tbz x14, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
+ "blt 3b\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..89c79cfb0a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..a4a40ad5ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const bfloat16 *const A,
+ const bfloat16 *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const bfloat16 *const A;
+ const bfloat16 *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa009426f // ldnt1w { z14.s-z15.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
+ ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x1\n"
+ "lsr x19, x19, #0x1\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1402747 // ld1h { z7.h, z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa14026df // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x22]\n"
+ ".inst 0xa0412748 // ld1h { z8.h-z9.h }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa04126c3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422752 // ld1h { z18.h, z26.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa04226d1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa1432756 // ld1h { z22.h, z30.h }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14326cc // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
+ ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
+ ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+ ".inst 0xa1402747 // ld1h { z7.h, z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
+ ".inst 0xa14026df // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x22]\n"
+ ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
+ ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
+ ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
+ ".inst 0xa0412748 // ld1h { z8.h-z9.h }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
+ ".inst 0xa04126c3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
+ ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
+ ".inst 0xa1422752 // ld1h { z18.h, z26.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa04226d1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
+ ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
+ ".inst 0xa1432756 // ld1h { z22.h, z30.h }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14326cc // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+ ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
+ ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
+ ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+ ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
+ ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
+ ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
+ ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
+ ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
+ ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
+ ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
+ ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
+ ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1402747 // ld1h { z7.h, z15.h }, pn9.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa14026d7 // ld1h { z23.h, z31.h }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+ ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
+ ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
+ ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 21f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa1604334 // st1w { z20.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604335 // st1w { z21.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ ".inst 0xa1604336 // st1w { z22.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "b 28f\n"
+ "21:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604320 // st1w { z0.s, z8.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604321 // st1w { z1.s, z9.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ ".inst 0xa1604322 // st1w { z2.s, z10.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "27:" // Store to output array: Accumulator row 1 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..0d407e0cba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..798a3cb470
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
  // Argument block handed to the inline-assembly kernel by address.
  // The asm reads fields through offsetof(KernelArgs, ...) in its operand
  // list, so the member layout is part of the asm's contract — do not
  // reorder or resize members without updating the assembly.
  struct KernelArgs
  {
    KernelArgs(
      const bfloat16 *const A,
      const bfloat16 *const B,
      float *const C, const int ldc,
      const int M, const int N, const int K,
      const float *const bias,
      const Activation act,
      bool accumulate,
      float *const accumulator_buffer
    ) : A(A),
        B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
        C(C), ldcb(ldc * sizeof(float)),
        M(M), N(N), K(K),
        n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
        min(-std::numeric_limits<float>::infinity()),
        max(std::numeric_limits<float>::infinity()),
        bias(bias),
        accumulator_buffer(accumulator_buffer),
        flags(0x0)
    {
      if (accumulate)
      {
        flags |= 1 << 0;  // FILL_ACCUMULATORS_FROM_BUFFER
      }
      if (C == nullptr)
      {
        flags |= 1 << 1;  // STORE_ACCUMULATORS_TO_BUFFER
      }
      if (act.type == Activation::Type::None)
      {
        flags |= 1 << 2;  // SKIP_ACTIVATION
      }

      // Initialise the activation values
      switch (act.type)
      {
        default:
        case Activation::Type::None:
          break;
        case Activation::Type::BoundedReLU:
          // BoundedReLU clamps above at param1 and, by falling through,
          // below at zero like plain ReLU.
          this->max = static_cast<float>(act.param1);
          /* fall through */
        case Activation::Type::ReLU:
          this->min = static_cast<float>(0);
          break;
      }
    }

    const bfloat16 *const A;
    const bfloat16 *const B;
    // Byte stride between packed B panels; K is rounded up to a multiple
    // of 2 because the kernel consumes bf16 elements in pairs.
    const long kstride_bytes;
    float *const C;
    // Leading dimension of C, in bytes (ldc * sizeof(float)).
    const long ldcb;
    // NOTE(review): n_loops/n_tail_iters are initialised here but the asm
    // recomputes its K-loop trip counts directly from K — these two may be
    // unused by this kernel; confirm against the asm operand list.
    const long M, N, K, n_loops, n_tail_iters;
    // Activation clamp bounds applied during the store-to-output phase.
    float min = -std::numeric_limits<float>::infinity();
    float max = std::numeric_limits<float>::infinity();

    const float *const bias;

    // Spill/fill area for ZA accumulators between kernel invocations.
    float *const accumulator_buffer;
    // Bit 0: fill accumulators from buffer; bit 1: store accumulators to
    // buffer; bit 2: skip activation (see constructor body above).
    uint64_t flags;
  };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z8.s, #1.0\n"
+ "ldnt1w { z27.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x1\n"
+ "lsr x19, x19, #0x1\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa040a344 // ld1h { z4.h-z7.h }, pn8.b/Z, [x26]\n"
+ "ldnt1h { z29.h }, p1/Z, [x22]\n"
+ ".inst 0xa041a34c // ld1h { z12.h-z15.h }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa042a340 // ld1h { z0.h-z3.h }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1h { z21.h }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa143a352 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
+ ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
+ ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+ ".inst 0xa040a344 // ld1h { z4.h-z7.h }, pn8.b/Z, [x26]\n"
+ ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
+ "ldnt1h { z29.h }, p1/Z, [x22]\n"
+ ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
+ ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
+ ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
+ ".inst 0xa041a34c // ld1h { z12.h-z15.h }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
+ "ldnt1h { z23.h }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
+ ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
+ ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
+ ".inst 0xa042a340 // ld1h { z0.h-z3.h }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1h { z21.h }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
+ ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
+ ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
+ ".inst 0xa143a352 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+ ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
+ ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
+ ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+ ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
+ ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
+ ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
+ ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
+ ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
+ ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
+ ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
+ ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
+ ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
+ ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
+ ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040a344 // ld1h { z4.h-z7.h }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z29.h }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+ ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
+ ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
+ ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a4 // st1w { z4.s-z7.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 42f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1ac // st1w { z12.s-z15.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 42f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 27f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 22f\n"
+ "21:" // Store to output array: Skip activation: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Skip activation: Accumulator row 2 oddments
+ "cbz x19, 23f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 25f\n"
+ "24:" // Store to output array: Skip activation: Accumulator row 3 loop
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z7.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Skip activation: Accumulator row 3 oddments
+ "cbz x19, 26f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "st1w { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "b 40f\n"
+ "27:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 29f\n"
+ "28:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 28b\n"
+ "29:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 30f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "st1w { z8.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z9.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "st1w { z10.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "30:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 32f\n"
+ "31:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 31b\n"
+ "32:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 33f\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "33:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 35f\n"
+ "34:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 34b\n"
+ "35:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 36f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "36:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 38f\n"
+ "37:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 37b\n"
+ "38:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 39f\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "39:" // Store to output array: Accumulator row 3 oddments: End
+ "40:" // Store to output array: End
+ "tbz x15, #0, 42f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "41:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 41b\n"
+ "42:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..7777349b42
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+// Kernel descriptor for sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.
+// Exposes the blocking geometry (one SVE vector length of output rows by
+// four vector lengths of output columns per tile) and the kernel's
+// capabilities (accumulate / bias / activation support) to the arm_gemm
+// framework's kernel-selection machinery.
+class cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ // Rows of C produced per tile: one vector length of fp32 elements.
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ // Columns of C produced per tile: four vector lengths of fp32 elements.
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ // No K-dimension unrolling is required by the transforms for this kernel.
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ // Marks this kernel as requiring SME streaming mode so the framework can
+ // gate it on CPU capability detection.
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_1VLx4VL;
+
+ // Interleaving transforms matching the 1VL x 4VL blocking (k_unroll 1).
+ StdTransformsSME<operand_type, result_type, 1, 4, 1> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..4f6d9a3d98
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+// SME2 FP32 interleaved GEMM kernel operating on a 1VL x 4VL output tile
+// using FMOPA (floating-point outer product and accumulate) into the ZA
+// array, with no separate merge step: accumulators are either written
+// directly to C (with an fclamp activation unless skipped) or spilled
+// to / refilled from accumulator_buffer when a block spans multiple calls.
+void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ // Argument block handed to the assembly; the asm reads fields via the
+ // offsetof_* operands below, so field layout must match those offsets.
+ struct KernelArgs
+ {
+ KernelArgs(
+ const float *const A,
+ const float *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(K * sizeof(float)),
+ C(C), ldcb(ldc * sizeof(float)),
+ // NOTE(review): n_loops/n_tail_iters are not referenced by the
+ // assembly below (it recomputes K/4 and K%4 from K itself) — confirm
+ // whether they are kept only for interface parity with other kernels.
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const float *const A;
+ const float *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ // SME2 instructions are emitted as raw ".inst" encodings (mnemonic shown
+ // in the trailing comment of each string) — presumably so the file
+ // assembles with toolchains lacking SME2 mnemonic support; confirm.
+ // Phases: SMSTART -> optional ZA refill -> per-(m,n)-block bias/zero,
+ // K loop of FMOPAs -> store-out (clamped or raw) or spill -> SMSTOP.
+ __asm__ __volatile__(
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ // Bias is applied as an outer product of a vector of 1.0f with the
+ // bias row, seeding all four ZA tiles in one pass.
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa01cc27d // ldnt1w { z28.s-z31.s }, p8/Z, [x19, x28, LSL #2]\n"
+ ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
+ ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
+ ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
+ ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x28\n"
+ "mov x20, x9\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x27\n"
+ "csel x20, x9, x20, LT\n"
+ "mov x19, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x20, x10\n"
+ "csel x14, x19, x14, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "lsr x22, x19, #0x2\n"
+ "and x21, x19, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_B]]\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x20, x28, x19, x20\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z0.s }, p0/Z, [x25]\n"
+ ".inst 0xa140c69b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa141c68a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ "ld1w { z12.s }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142c68b // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x20, #0x8, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143c698 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x20, #0xc, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
+ ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
+ ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+ "ld1w { z0.s }, p0/Z, [x25]\n"
+ ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
+ ".inst 0xa140c69b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x20]\n"
+ ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
+ ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
+ ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
+ "ld1w { z13.s }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
+ ".inst 0xa141c68a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
+ ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
+ ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
+ "ld1w { z12.s }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142c68b // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x20, #0x8, MUL VL]\n"
+ ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
+ ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
+ ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
+ "ld1w { z26.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143c698 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x20, #0xc, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+ ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
+ ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
+ ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+ ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
+ ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
+ ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
+ ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
+ ".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
+ ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
+ ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
+ ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
+ ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
+ ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
+ ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1w { z0.s }, p0/Z, [x25]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xa140c693 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x20]\n"
+ "addvl x20, x20, #4\n"
+ ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+ ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
+ ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
+ ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
+ "addvl x13, x13, #16\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c56c // st1w { z12.s-z15.s }, pn9.b, [x11]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28, LSL #2\n" // C += n
+ "sub x23, x10, x9\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
+ "tbz x14, #2, 18f\n"
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "csel x21, x23, x19, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x23, x23, x21\n"
+ "beq 18f\n"
+ "b 22f\n"
+ "18:" // Store to output array: Skip activation: End
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x19, x23, x19, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "21:" // Store to output array: Accumulator row 0 oddments: End
+ "22:" // Store to output array: End
+ "tbz x14, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
+ "blt 3b\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..51e8c43335
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL
+{
+public:
+  typedef float operand_type;
+  typedef float result_type;
+
+  typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+  /* Kernel blocking parameters: each block computes a 2VL x 2VL output tile (VL = SVE vector length in float elements). */
+  static unsigned int out_height() // rows of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 2;
+  }
+
+  static unsigned int out_width() // columns of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 2;
+  }
+
+  static constexpr unsigned int k_unroll() // K elements consumed per accumulation step (no unrolling)
+  {
+    return 1;
+  }
+
+  static constexpr bool supports_accumulate()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_bias()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_activation()
+  {
+    return true;
+  }
+
+  static constexpr bool is_sme()
+  {
+    return true;
+  }
+
+  // Default to the generic (assembly) kernel implementation declared above.
+  kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_2VLx2VL;
+
+  StdTransformsSME<operand_type, result_type, 2, 2, 1> transforms = {}; // interleave transforms matching the 2VL x 2VL blocking, k_unroll = 1
+
+  cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *ci)
+  {
+    ARM_COMPUTE_UNUSED(ci); // no CPU-specific kernel selection is performed
+  }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..344215bfa5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+    struct KernelArgs // argument block; the inline asm reads each field via [args] + offsetof(KernelArgs, ...)
+    {
+      KernelArgs(
+        const float *const A,
+        const float *const B,
+        float *const C, const int ldc,
+        const int M, const int N, const int K,
+        const float *const bias,
+        const Activation act,
+        bool accumulate,
+        float *const accumulator_buffer
+      ) : A(A),
+          B(B), kstride_bytes(K * sizeof(float)),  // byte stride between consecutive B column panels (asm: bptr = B + n * kstride_bytes)
+          C(C), ldcb(ldc * sizeof(float)),  // C row stride in bytes (asm: C += m * ldc)
+          M(M), N(N), K(K),
+          n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),  // NOTE(review): not read by the asm below, which recomputes K/4 and K%4 — confirm before relying on these
+          min(-std::numeric_limits<float>::infinity()),
+          max(std::numeric_limits<float>::infinity()),
+          bias(bias),
+          accumulator_buffer(accumulator_buffer),
+          flags(0x0)
+      {
+        if (accumulate)
+        {
+          flags |= 1 << 0;  // FILL_ACCUMULATORS_FROM_BUFFER
+        }
+        if (C == nullptr)
+        {
+          flags |= 1 << 1;  // STORE_ACCUMULATORS_TO_BUFFER
+        }
+        if (act.type == Activation::Type::None)
+        {
+          flags |= 1 << 2;  // SKIP_ACTIVATION
+        }
+
+        // Initialise the activation clamp values (min/max used by the asm's fclamp)
+        switch (act.type)
+        {
+          default:
+          case Activation::Type::None:
+              break;
+          case Activation::Type::BoundedReLU:
+              this->max = static_cast<float>(act.param1);
+              /* fall through */
+          case Activation::Type::ReLU:
+              this->min = static_cast<float>(0);
+              break;
+        }
+      }
+
+      const float *const A;
+      const float *const B;
+      const long kstride_bytes; // K * sizeof(float): B panel stride in bytes
+      float *const C;
+      const long ldcb; // ldc * sizeof(float): C row stride in bytes
+      const long M, N, K, n_loops, n_tail_iters;
+      float min = -std::numeric_limits<float>::infinity(); // activation clamp lower bound
+      float max = std::numeric_limits<float>::infinity(); // activation clamp upper bound
+
+      const float *const bias;
+
+      float *const accumulator_buffer; // area the asm stores ZA accumulators to / reloads them from between blocks
+      uint64_t flags; // bit 0: fill accumulators from buffer, bit 1: store accumulators to buffer, bit 2: skip activation
+    };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa009426f // ldnt1w { z14.s-z15.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
+ ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "lsr x22, x19, #0x2\n"
+ "and x21, x19, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_B]]\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x20, x9, x19, x20\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1404747 // ld1w { z7.s, z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xa140469f // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x20]\n"
+ ".inst 0xa0414748 // ld1w { z8.s-z9.s }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa0414683 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa1424752 // ld1w { z18.s, z26.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa0424691 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa1434756 // ld1w { z22.s, z30.s }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa143468c // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x20, #0x6, MUL VL]\n"
+ "addvl x20, x20, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
+ ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
+ ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+ ".inst 0xa1404747 // ld1w { z7.s, z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
+ ".inst 0xa140469f // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x20]\n"
+ ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
+ ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
+ ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
+ ".inst 0xa0414748 // ld1w { z8.s-z9.s }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
+ ".inst 0xa0414683 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
+ ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
+ ".inst 0xa1424752 // ld1w { z18.s, z26.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa0424691 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
+ ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
+ ".inst 0xa1434756 // ld1w { z22.s, z30.s }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa143468c // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x20, #0x6, MUL VL]\n"
+ "addvl x20, x20, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+ ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
+ ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
+ ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+ ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
+ ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
+ ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
+ ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
+ ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
+ ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
+ ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
+ ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
+ ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1404747 // ld1w { z7.s, z15.s }, pn9.b/Z, [x26]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa1404697 // ld1w { z23.s, z31.s }, pn9.b/Z, [x20]\n"
+ "addvl x20, x20, #2\n"
+ ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+ ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
+ ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
+ ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 21f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa1604334 // st1w { z20.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604335 // st1w { z21.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ ".inst 0xa1604336 // st1w { z22.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "b 28f\n"
+ "21:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604320 // st1w { z0.s, z8.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604321 // st1w { z1.s, z9.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ ".inst 0xa1604322 // st1w { z2.s, z10.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "27:" // Store to output array: Accumulator row 1 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..a315ebb323
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL
+{
+public:
+  typedef float operand_type;
+  typedef float result_type;
+
+  typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+  /* Kernel blocking parameters: each block computes a 4VL x 1VL output tile (VL = SVE vector length in float elements). */
+  static unsigned int out_height() // rows of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 4;
+  }
+
+  static unsigned int out_width() // columns of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 1;
+  }
+
+  static constexpr unsigned int k_unroll() // K elements consumed per accumulation step (no unrolling)
+  {
+    return 1;
+  }
+
+  static constexpr bool supports_accumulate()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_bias()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_activation()
+  {
+    return true;
+  }
+
+  static constexpr bool is_sme()
+  {
+    return true;
+  }
+
+  // Default to the generic (assembly) kernel implementation declared above.
+  kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_4VLx1VL;
+
+  StdTransformsSME<operand_type, result_type, 4, 1, 1> transforms = {}; // interleave transforms matching the 4VL x 1VL blocking, k_unroll = 1
+
+  cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *ci)
+  {
+    ARM_COMPUTE_UNUSED(ci); // no CPU-specific kernel selection is performed
+  }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..5252e8140b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const float *const A,
+ const float *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(K * sizeof(float)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const float *const A;
+ const float *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z8.s, #1.0\n"
+ "ldnt1w { z27.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "lsr x22, x19, #0x2\n"
+ "and x21, x19, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_B]]\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x20, x9, x19, x20\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c344 // ld1w { z4.s-z7.s }, pn8.b/Z, [x26]\n"
+ "ldnt1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xa041c34c // ld1w { z12.s-z15.s }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x20, #1, MUL VL]\n"
+ ".inst 0xa042c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1w { z21.s }, p1/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xa143c352 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "addvl x20, x20, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
+ ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
+ ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+ ".inst 0xa040c344 // ld1w { z4.s-z7.s }, pn8.b/Z, [x26]\n"
+ ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
+ "ldnt1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
+ ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
+ ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
+ ".inst 0xa041c34c // ld1w { z12.s-z15.s }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
+ "ldnt1w { z23.s }, p1/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
+ ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
+ ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
+ ".inst 0xa042c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1w { z21.s }, p1/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
+ ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
+ ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
+ ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
+ ".inst 0xa143c352 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "addvl x20, x20, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+ ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
+ ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
+ ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+ ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
+ ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
+ ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
+ ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
+ ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
+ ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
+ ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
+ ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
+ ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
+ ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
+ ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
+ ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040c344 // ld1w { z4.s-z7.s }, pn8.b/Z, [x26]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "addvl x20, x20, #1\n"
+ ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+ ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
+ ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
+ ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a4 // st1w { z4.s-z7.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 42f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1ac // st1w { z12.s-z15.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 42f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 27f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 22f\n"
+ "21:" // Store to output array: Skip activation: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Skip activation: Accumulator row 2 oddments
+ "cbz x19, 23f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 25f\n"
+ "24:" // Store to output array: Skip activation: Accumulator row 3 loop
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z7.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Skip activation: Accumulator row 3 oddments
+ "cbz x19, 26f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "st1w { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "b 40f\n"
+ "27:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 29f\n"
+ "28:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 28b\n"
+ "29:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 30f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "st1w { z8.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z9.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "st1w { z10.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "30:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 32f\n"
+ "31:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 31b\n"
+ "32:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 33f\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "33:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 35f\n"
+ "34:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 34b\n"
+ "35:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 36f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "36:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 38f\n"
+ "37:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 37b\n"
+ "38:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 39f\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "39:" // Store to output array: Accumulator row 3 oddments: End
+ "40:" // Store to output array: End
+ "tbz x15, #0, 42f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "41:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 41b\n"
+ "42:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..b8bcd53c21
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..62170c4945
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+// SME2 kernel: computes a 1VLx4VL int32 tile of A(s8) x B(s8) via SMOPA
+// (signed outer-product-and-accumulate into ZA), adds bias and the
+// row/column sums embedded in the interleaved operands (ADDHA/ADDVA),
+// then requantizes (sqdmulh + srshl + c_offset + sclamp) and stores s8
+// results to C.  Depending on `flags`, the int32 tile is instead loaded
+// from / stored to `accumulator_buffer` for partially-computed blocks.
+void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A; // interleaved LHS operand
+ const int8_t *const B; // interleaved RHS operand
+ // Byte stride between column blocks of B; K is padded to a multiple
+ // of 4 to match the SMOPA 4-byte inner-product grouping.
+ const long kstride_bytes;
+ int8_t *const C; // output (may be nullptr -> spill to buffer)
+ const long ldcb; // row stride of C in bytes
+ // NOTE(review): n_loops/n_tail_iters are initialized here but are not
+ // in the asm operand list below; the asm recomputes its own loop
+ // counts from K.  Kept for layout consistency with sibling kernels.
+ const long M, N, K, n_loops, n_tail_iters;
+ // NOTE(review): min/max are likewise not referenced by the asm, which
+ // clamps using rq.minval/rq.maxval directly.
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias; // per-column int32 bias, may be null
+ const int n_0; // global column offset for per-channel params
+
+ int32_t *const accumulator_buffer; // int32 spill area for partial tiles
+ uint64_t flags; // bit0: fill from buffer, bit1: store to buffer, bit2: per-channel
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ // Flow (labels): 1 = refill ZA from buffer; 3 = M/N tile loop;
+ // 5-7 = K loop (4x-unrolled SMOPA) plus 9 = K oddments; then either
+ // 11/13 = spill int32 tile to buffer, or 14-18 = requantize + store s8
+ // rows to C; 20 = refill accumulators; 21 = advance to next tile.
+ __asm__ __volatile__(
+ "ldr x13, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x13, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w9, [%x[args], %[offsetof_M]]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldr w26, [%x[args], %[offsetof_N]]\n"
+ "ldr x25, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x24, x25\n"
+ ".inst 0x25ba6770 // whilelt pn8.s, x27, x26, VLx4\n"
+ "tbnz x13, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa01bc279 // ldnt1w { z24.s-z27.s }, p8/Z, [x19, x27, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n"
+ ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x27\n"
+ "mov x20, x28\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x26\n"
+ "csel x20, x28, x20, LT\n"
+ "mov x19, x13\n"
+ "bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
+ "cmp x20, x9\n"
+ "csel x13, x19, x13, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x27, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa08f2603 // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa08f2603 // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xa04086dc // ld1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "ld1w { z14.s }, p1/Z, [x24]\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n"
+ "tbz x13, #1, 14f\n"
+ "tbz x13, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xa061c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa062c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 11b\n"
+ "b 21f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa061c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 13b\n"
+ "b 21f\n"
+ "14:" // Store to output array
+ "ldr x23, [%x[args], %[offsetof_C]]\n"
+ "add x23, x23, x27\n" // C += n
+ "sub x22, x9, x28\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x21, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x23, x28, x21, x23\n" // C += m * ldc
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x13, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x27\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c26c // ld1w { z12.s-z15.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c264 // ld1w { z4.s-z7.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x19\n"
+ "whilelt p0.b, x27, x26\n"
+ "cmp x22, x19\n"
+ "csel x19, x22, x19, LT\n"
+ "lsr x20, x19, #0x1\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x1\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x20, LSL #1\n"
+ ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
+ ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
+ ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
+ ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
+ ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
+ ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
+ ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
+ ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
+ ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
+ ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
+ ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
+ ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z23.b, z2.b, z24.b\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z16.b, z10.b\n"
+ "uzp1 z16.b, z23.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "19:" // Store to output array: End
+ "tbz x13, #0, 21f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "20:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 20b\n"
+ "21:" // End block
+ "incw x27, ALL, MUL #4\n"
+ "cmp x27, x26\n"
+ "blt 3b\n"
+ "incw x28\n"
+ "cmp x28, x9\n"
+ "mov x27, #0x0\n"
+ "mov x25, x24\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..954b0da0e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+// Wrapper/descriptor for the SME2 quantized s8 GEMM kernel with a 2VLx2VL
+// output tile: per call it produces 2 vector-lengths of rows by 2
+// vector-lengths of columns (in int32 accumulator lanes).  "nomerge"
+// kernels write their own output (or spill int32 partials to an
+// accumulator buffer) instead of using a separate merge step.
+class cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL
+{
+public:
+ typedef int8_t operand_type; // element type of the A/B inputs
+ typedef int8_t result_type; // element type of the requantized output C
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ // Rows of C produced per tile: two SVE vectors of int32 elements.
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ // Columns of C produced per tile: two SVE vectors of int32 elements.
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ // K dimension is processed in groups of 4 int8 values (matches the
+ // roundup(K, 4) operand blocking used by the kernel).
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ // Accumulation into existing C is not supported; partial results go
+ // through the int32 accumulator buffer instead.
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ // Activation is handled by the requantize clamp (minval/maxval), not here.
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_2VLx2VL;
+
+ // Transforms for a 2x2 tile arrangement with k-blocking of 4;
+ // final template argument selects the quantized (integer) variant.
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..e565699af5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa0094275 // ldnt1w { z20.s-z21.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n"
+ ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa04006d0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040474e // ld1w { z14.s-z15.s }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404262 // ld1w { z2.s-z3.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404260 // ld1w { z0.s-z1.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z12.h, z28.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z13.h, z29.h\n"
+ "uzp1 z17.h, z14.h, z30.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z15.h, z31.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z28.h, z12.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z29.h, z13.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "uzp1 z16.h, z30.h, z14.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 22f\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
+ "uzp1 z17.h, z6.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z7.h, z19.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z20.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z21.h, z17.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "uzp1 z16.h, z22.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "22:" // Store to output array: End
+ "tbz x15, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..420c219af5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..a738a10418
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "ldnt1w { z15.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1b { z0.b }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n"
+ ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n"
+ ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1bc // st1w { z28.s-z31.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1b4 // st1w { z20.s-z23.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z8.s }, p0/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z7.s }, p0/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z15.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
+ "st1b { z28.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z29.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "st1b { z30.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
+ "st1b { z24.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z25.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z26.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z27.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
+ "st1b { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
+ "st1b { z0.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z1.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "st1b { z2.s }, p0, [x25]\n"
+ "27:" // Store to output array: Accumulator row 3 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..c969c7aaff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL // Descriptor for the SME2 s8->s32 MOPA kernel with a 1VL x 4VL output tile.
+{
+public:
+ typedef int8_t operand_type; // Element type of the packed A/B operands.
+ typedef int32_t result_type; // Element type of the C output / ZA accumulators.
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); // Common signature for this kernel family.
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height() // Output rows per tile: 1 x VL(int32) — the "1VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static unsigned int out_width() // Output columns per tile: 4 x VL(int32) — the "4VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll() // K values consumed per packed group (int8 inputs, 4 per int32 lane).
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate() // Kernel can resume from / spill to a partial accumulator buffer.
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias() // Bias is folded in via ADDHA at accumulator setup.
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation() // Activation is ignored by the generic kernel (ARM_COMPUTE_UNUSED(act)).
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme() // Requires SME; caller must gate on CPU feature detection.
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {}; // Pack/interleave transforms matching the 1x4 tile shape and k_unroll of 4.
+
+ cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *ci) // CPUInfo unused: only one implementation exists for this class.
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..7ddd7c2e09
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer) // SME2 MOPA GEMM: int8 operands -> int32 results, 1VLx4VL tile, "nomerge" (partials staged via ZA / accumulator buffer; C may be nullptr).
+{
+ ARM_COMPUTE_UNUSED(act); // Activation unsupported here (see supports_activation() == false in the class header).
+
+ struct KernelArgs // Flat argument block read by the inline asm via offsetof().
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int32_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)), // B panel stride: K rounded up to k_unroll (4) int8s.
+ C(C), ldcb(ldc * sizeof(int32_t)), // Row stride of C in bytes.
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2), // NOTE(review): appears unused — the asm below recomputes its K-loop counts from K directly (label 5); presumably kept by the generator. TODO confirm.
+
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int32_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+
+ const int32_t *const bias;
+
+ int32_t *const accumulator_buffer; // Spill/fill area for ZA when flags bit0/bit1 are set.
+ uint64_t flags; // bit0: fill ZA from buffer on entry; bit1: store ZA to buffer instead of C.
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x11, [%x[args], %[offsetof_flags]]\n" // x11 = flags for the current block.
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n" // x10 = accumulator-buffer read pointer.
+ "ldr x9, [%x[args], %[offsetof_accumulator_buffer]]\n" // x9 = accumulator-buffer write pointer.
+ "tbz x11, #0, 2f\n" // Skip initial ZA fill unless flags bit0 set.
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c544 // ld1w { z4.s-z7.s }, pn9.b/Z, [x10]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c55c // ld1w { z28.s-z31.s }, pn9.b/Z, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c540 // ld1w { z0.s-z3.s }, pn9.b/Z, [x10, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x10, x10, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w28, [%x[args], %[offsetof_M]]\n" // w28 = M; x27 = current row block, x26 = current column.
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "ldr w25, [%x[args], %[offsetof_N]]\n"
+ "ldr x24, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x23, x24\n"
+ ".inst 0x25b96750 // whilelt pn8.s, x26, x25, VLx4\n"
+ "tbnz x11, #0, 4f\n" // If resuming from buffer, don't zero/bias the accumulators.
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n" // No bias pointer -> leave accumulators zeroed.
+ ".inst 0xa11ac26a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, p8/Z, [x19, x26, LSL #2]\n"
+ ".inst 0xc0900040 // addha za0.s, p0/M, p0/M, z2.s\n"
+ ".inst 0xc09000c1 // addha za1.s, p0/M, p0/M, z6.s\n"
+ ".inst 0xc0900142 // addha za2.s, p0/M, p0/M, z10.s\n"
+ ".inst 0xc09001c3 // addha za3.s, p0/M, p0/M, z14.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x26\n"
+ "mov x20, x27\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x25\n"
+ "csel x20, x27, x20, LT\n"
+ "mov x19, x11\n"
+ "bfm x11, XZR, #0x0, #0x0 // bfc x11, #0x0, #0x1\n"
+ "cmp x20, x28\n"
+ "csel x11, x19, x11, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n" // Recompute K iteration counts: x21 = K/4 quads, x20 = remainder.
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x26, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ ".inst 0xa14086c9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x22]\n"
+ "ld1b { z10.b }, p0/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa14186da // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1b { z16.b }, p0/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa14286cb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1b { z25.b }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa14386c8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
+ ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
+ ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
+ ".inst 0xa14086c9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
+ ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
+ ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
+ "ld1b { z10.b }, p0/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
+ ".inst 0xa14186da // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
+ ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
+ ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa14286cb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
+ ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
+ ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
+ ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
+ "ld1b { z25.b }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa14386c8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+ ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
+ ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
+ ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+ ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
+ ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
+ ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
+ ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
+ ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
+ ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
+ ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
+ ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
+ ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
+ ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
+ ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
+ ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0xa14086c1 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+ ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
+ ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
+ ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x11, #1, 14f\n" // flags bit1 clear -> write results to C; set -> spill ZA to buffer.
+ "tbz x11, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c55c // ld1w { z28.s-z31.s }, pn9.b/Z, [x10]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c540 // ld1w { z0.s-z3.s }, pn9.b/Z, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c54c // ld1w { z12.s-z15.s }, pn9.b/Z, [x10, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c524 // st1w { z4.s-z7.s }, pn9.b, [x9]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0xa061c534 // st1w { z20.s-z23.s }, pn9.b, [x9, #0x4, MUL VL]\n"
+ ".inst 0xa062c538 // st1w { z24.s-z27.s }, pn9.b, [x9, #0x8, MUL VL]\n"
+ ".inst 0xa063c53c // st1w { z28.s-z31.s }, pn9.b, [x9, #0xc, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "blt 11b\n"
+ "b 20f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c534 // st1w { z20.s-z23.s }, pn9.b, [x9]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c520 // st1w { z0.s-z3.s }, pn9.b, [x9, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c528 // st1w { z8.s-z11.s }, pn9.b, [x9, #0x8, MUL VL]\n"
+ ".inst 0xa063c52c // st1w { z12.s-z15.s }, pn9.b, [x9, #0xc, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "blt 13b\n"
+ "b 20f\n"
+ "14:" // Store to output array
+ "ldr x22, [%x[args], %[offsetof_C]]\n"
+ "sub x20, x28, x27\n"
+ "cntw x19\n"
+ "ldr x21, [%x[args], %[offsetof_ldcb]]\n"
+ "cmp x20, x19\n"
+ "csel x19, x20, x19, LT\n"
+ "add x22, x22, x26, LSL #2\n" // C += n
+ "lsr x20, x19, #0x2\n"
+ "madd x22, x27, x21, x22\n" // C += m * ldc
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2c0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ ".inst 0xa160c2c1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c2c2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c2c3 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2c0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c2c1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "beq 17f\n"
+ ".inst 0xa160c2c2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x22]\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "18:" // Store to output array: End
+ "tbz x11, #0, 20f\n" // If the next block continues this tile, refill ZA from the buffer.
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "19:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c54c // ld1w { z12.s-z15.s }, pn9.b/Z, [x10]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c544 // ld1w { z4.s-z7.s }, pn9.b/Z, [x10, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x10, x10, #16\n"
+ "blt 19b\n"
+ "20:" // End block
+ "incw x26, ALL, MUL #4\n" // Advance column by one 4VL tile; loop N first, then M.
+ "cmp x26, x25\n"
+ "blt 3b\n"
+ "incw x27\n"
+ "cmp x27, x28\n"
+ "mov x26, #0x0\n"
+ "mov x24, x23\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..a0705e50cd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL // Descriptor for the SME2 s8->s32 MOPA kernel with a square 2VL x 2VL output tile.
+{
+public:
+ typedef int8_t operand_type; // Element type of the packed A/B operands.
+ typedef int32_t result_type; // Element type of the C output / ZA accumulators.
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); // Common signature for this kernel family.
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height() // Output rows per tile: 2 x VL(int32) — the first "2VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static unsigned int out_width() // Output columns per tile: 2 x VL(int32) — the second "2VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll() // K values consumed per packed group (int8 inputs, 4 per int32 lane).
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate() // Kernel can resume from / spill to a partial accumulator buffer.
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation() // Activation is ignored by the generic kernel.
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme() // Requires SME; caller must gate on CPU feature detection.
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {}; // Pack/interleave transforms matching the 2x2 tile shape and k_unroll of 4.
+
+ cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *ci) // CPUInfo unused: only one implementation exists for this class.
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..9ae18f0e6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
// SME2 GEMM kernel: accumulates C (int32) += A (int8, interleaved panels) *
// B (int8, interleaved panels) using widening sum-of-outer-products (SMOPA)
// instructions into a 2VL x 2VL tile of ZA accumulators per M/N iteration.
//
// A, B       : interleaved operand panels; K is rounded up to a multiple of 4.
// C, ldc     : output array and leading dimension (in elements). When C is
//              nullptr the accumulators are spilled to accumulator_buffer
//              instead of being written to an output array.
// bias       : optional per-column int32 bias (may be nullptr); added into the
//              accumulators once per tile via ADDHA.
// act        : unused - this kernel does not fuse activation.
// accumulate : when true, ZA is preloaded from accumulator_buffer rather than
//              zeroed, so partial results carry across successive calls.
void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
{
    ARM_COMPUTE_UNUSED(act);

    // Argument block read by the assembly via offsetof()-based loads.
    struct KernelArgs
    {
        KernelArgs(
            const int8_t *const A,
            const int8_t *const B,
            int32_t *const C, const int ldc,
            const int M, const int N, const int K,
            const int32_t *const bias,

            bool accumulate,
            int32_t *const accumulator_buffer
        ) : A(A),
            B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
            C(C), ldcb(ldc * sizeof(int32_t)),
            M(M), N(N), K(K),
            // NOTE(review): n_loops/n_tail_iters are initialised here but are
            // not referenced by this kernel's assembly, which recomputes its
            // loop counts directly from K (see label 5 below).
            n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),

            bias(bias),
            accumulator_buffer(accumulator_buffer),
            flags(0x0)
        {
            if (accumulate)
            {
                flags |= 1 << 0;  // FILL_ACCUMULATORS_FROM_BUFFER
            }
            if (C == nullptr)
            {
                flags |= 1 << 1;  // STORE_ACCUMULATORS_TO_BUFFER
            }
        }

        const int8_t *const A;
        const int8_t *const B;
        const long kstride_bytes;   // byte stride between B panels (K rounded to 4)
        int32_t *const C;
        const long ldcb;            // C leading dimension in bytes
        const long M, N, K, n_loops, n_tail_iters;

        const int32_t *const bias;

        int32_t *const accumulator_buffer;
        uint64_t flags;             // bit 0: fill ZA from buffer; bit 1: store ZA to buffer
    };

    // Construct arguments for this kernel
    KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);

    __asm__ __volatile__(
        // x15 = flags; x14/x13 = accumulator-buffer read/write cursors.
        "ldr x15, [%x[args], %[offsetof_flags]]\n"
        ".inst 0xd503477f // SMSTART ZA\n"
        "ptrue p0.b\n"
        ".inst 0x25207811 // ptrue pn9.b\n"
        "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
        "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
        // Flags bit 0 set: refill ZA from the accumulator buffer first.
        "tbz x15, #0, 2f\n"
        "mov x12, #0x0\n"
        "cntw x19\n"
        "1:" // Initial accumulator load from buffer: Loop
        ".inst 0xa040c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14]\n"
        ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
        ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
        ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
        ".inst 0xa042c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
        ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
        ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
        ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        "addvl x14, x14, #16\n"
        "blt 1b\n"
        "2:" // Initial accumulator load from buffer: End
        // Outer tile loop state: x11 = M, x28 = N, x10 = row pos, x9 = col pos,
        // x27 walks the A panels.
        "ldr w11, [%x[args], %[offsetof_M]]\n"
        "mov x10, #0x0\n"
        "mov x9, #0x0\n"
        "ldr w28, [%x[args], %[offsetof_N]]\n"
        "ldr x27, [%x[args], %[offsetof_A]]\n"
        "3:" // M and N loop
        "mov x26, x27\n"
        ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
        "tbnz x15, #0, 4f\n"
        // Fresh tile: zero ZA, then add the bias row (if any) into each
        // accumulator tile via ADDHA.
        "ldr x19, [%x[args], %[offsetof_bias]]\n"
        ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
        "cbz x19, 5f\n"
        ".inst 0xa109427c // ldnt1w { z20.s, z28.s }, p8/Z, [x19, x9, LSL #2]\n"
        ".inst 0xc0900280 // addha za0.s, p0/M, p0/M, z20.s\n"
        ".inst 0xc0900381 // addha za1.s, p0/M, p0/M, z28.s\n"
        ".inst 0xc0900282 // addha za2.s, p0/M, p0/M, z20.s\n"
        ".inst 0xc0900383 // addha za3.s, p0/M, p0/M, z28.s\n"
        "4:" // Prepare accumulators: Test for last block
        // Clear the fill flag except when this is the final (bottom-right)
        // block, so refilling only happens once per pass.
        "mov x19, x9\n"
        "mov x20, x10\n"
        "incw x19, ALL, MUL #2\n"
        "incw x20, ALL, MUL #2\n"
        "cmp x19, x28\n"
        "csel x20, x10, x20, LT\n"
        "mov x19, x15\n"
        "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
        "cmp x20, x11\n"
        "csel x15, x19, x15, LT\n"
        "5:" // Prepare accumulators: End
        // Derive loop counts from K: x21 = unrolled (x4) iterations,
        // x20 = leftover K groups.
        "ldr x19, [%x[args], %[offsetof_K]]\n"
        "add x19, x19, #0x3\n"
        "lsr x19, x19, #0x2\n"
        "ldr x22, [%x[args], %[offsetof_B]]\n"
        "lsr x21, x19, #0x2\n"
        "and x20, x19, #0x3\n"
        "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
        "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
        "cbz x21, 8f\n"
        // Software-pipelined preload of the first four A/B vector pairs.
        "subs x21, x21, #0x1\n"
        ".inst 0xa1400756 // ld1b { z22.b, z30.b }, pn9.b/Z, [x26]\n"
        ".inst 0xa14006d9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x22]\n"
        ".inst 0xa1410750 // ld1b { z16.b, z24.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
        ".inst 0xa14106cb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
        ".inst 0xa0420748 // ld1b { z8.b-z9.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
        ".inst 0xa04206d3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
        ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
        "addvl x26, x26, #8\n"
        ".inst 0xa14306dd // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
        "addvl x22, x22, #8\n"
        "ble 7f\n"
        "6:" // K loop
        // Four groups of 2x2 SMOPA outer products per iteration, with the next
        // iteration's loads interleaved between them.
        ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
        "subs x21, x21, #0x1\n"
        ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
        ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
        ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
        ".inst 0xa1400756 // ld1b { z22.b, z30.b }, pn9.b/Z, [x26]\n"
        ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
        ".inst 0xa14006d9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x22]\n"
        ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
        ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
        ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
        ".inst 0xa1410750 // ld1b { z16.b, z24.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
        ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
        ".inst 0xa14106cb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
        ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
        ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
        ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
        ".inst 0xa0420748 // ld1b { z8.b-z9.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
        ".inst 0xa04206d3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
        ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
        ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
        ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
        ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
        ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
        "addvl x26, x26, #8\n"
        ".inst 0xa14306dd // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
        "addvl x22, x22, #8\n"
        "bgt 6b\n"
        "7:" // K loop tail
        // Drain the final preloaded group (no further loads).
        ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
        ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
        ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
        ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
        ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
        ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
        ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
        ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
        ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
        ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
        ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
        ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
        ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
        ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
        ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
        ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
        "8:" // K oddments
        "cbz x20, 10f\n"
        "9:" // K oddments: Loop
        ".inst 0xa1400756 // ld1b { z22.b, z30.b }, pn9.b/Z, [x26]\n"
        "subs x20, x20, #0x1\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa14006d1 // ld1b { z17.b, z25.b }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
        ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
        ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
        ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
        "bgt 9b\n"
        "10:" // K oddments: End
        // Flags bit 1 set: spill ZA to the partial-result buffer (optionally
        // refilling it at the same time when bit 0 is also set); otherwise
        // store the tile to the output array C.
        "tbz x15, #1, 14f\n"
        "tbz x15, #0, 12f\n"
        "mov x12, #0x0\n"
        "cntw x19\n"
        "11:" // Store to partial result buffer: Store and refill: Loop
        ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
        ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
        ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
        ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
        ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
        ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
        ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
        ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
        ".inst 0xa042c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
        ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
        ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
        ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        ".inst 0xa060c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13]\n"
        "addvl x14, x14, #16\n"
        ".inst 0xa061c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0x4, MUL VL]\n"
        ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
        ".inst 0xa063c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0xc, MUL VL]\n"
        "addvl x13, x13, #16\n"
        "blt 11b\n"
        "b 23f\n"
        "12:" // Store to partial result buffer: Store only
        "mov x12, #0x0\n"
        "cntw x19\n"
        "13:" // Store to partial result buffer: Store only: Loop
        ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
        ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
        ".inst 0xa060c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13]\n"
        ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
        ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
        ".inst 0xa061c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x4, MUL VL]\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
        ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
        "addvl x13, x13, #16\n"
        "blt 13b\n"
        "b 23f\n"
        "14:" // Store to output array
        // x25 = &C[m * ldc + n]; x24 = rows remaining, x21 = rows this pass
        // (clamped to one vector length), split into x20 groups of 4 plus
        // x19 odd rows.
        "ldr x25, [%x[args], %[offsetof_C]]\n"
        "sub x24, x11, x10\n"
        "cntw x23\n"
        "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
        "cmp x24, x23\n"
        "csel x21, x24, x23, LT\n"
        "add x25, x25, x9, LSL #2\n" // C += n
        "lsr x20, x21, #0x2\n"
        "madd x25, x10, x22, x25\n" // C += m * ldc
        "mov x12, #0x0\n"
        "and x19, x21, #0x3\n"
        "cbz x20, 16f\n"
        "15:" // Store to output array: Accumulator row 0 loop
        ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
        ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
        ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "add x12, x12, #0x4\n"
        ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "cmp x12, x20, LSL #2\n"
        ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "blt 15b\n"
        "16:" // Store to output array: Accumulator row 0 oddments
        "cbz x19, 17f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
        ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
        ".inst 0xa1604320 // st1w { z0.s, z8.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 17f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xa1604321 // st1w { z1.s, z9.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 17f\n"
        ".inst 0xa1604322 // st1w { z2.s, z10.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "17:" // Store to output array: Accumulator row 0 oddments: End
        "subs x24, x24, x21\n"
        "beq 21f\n"
        "cmp x24, x23\n"
        "csel x19, x24, x23, LT\n"
        "lsr x20, x19, #0x2\n"
        "mov x12, #0x0\n"
        "and x19, x19, #0x3\n"
        "cbz x20, 19f\n"
        "18:" // Store to output array: Accumulator row 1 loop
        ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
        ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
        ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "add x12, x12, #0x4\n"
        ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "cmp x12, x20, LSL #2\n"
        ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "blt 18b\n"
        "19:" // Store to output array: Accumulator row 1 oddments
        "cbz x19, 20f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
        ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
        ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 20f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 20f\n"
        ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
        "20:" // Store to output array: Accumulator row 1 oddments: End
        "21:" // Store to output array: End
        // Flags bit 0 still set: preload ZA for the next block from the buffer.
        "tbz x15, #0, 23f\n"
        "mov x12, #0x0\n"
        "cntw x19\n"
        "22:" // Store to output array: Refill accumulators: Loop
        ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
        ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
        ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
        ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
        ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
        ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
        ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
        ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        "addvl x14, x14, #16\n"
        "blt 22b\n"
        "23:" // End block
        // Advance N; when a full row of tiles is done, advance M and reset N.
        "incw x9, ALL, MUL #2\n"
        "cmp x9, x28\n"
        "blt 3b\n"
        "incw x10, ALL, MUL #2\n"
        "cmp x10, x11\n"
        "mov x9, #0x0\n"
        "mov x27, x26\n"
        "blt 3b\n"
        ".inst 0xd503467f // SMSTOP\n"
        :
        : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
        : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
    );
}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..be1106da13
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
// Kernel descriptor for the SME2 interleaved GEMM: signed 8-bit operands,
// 32-bit integer accumulation/result, producing a tall 4VL x 1VL output tile
// per block. "nomerge" means the kernel stores its own results (or spills
// them to a partial-result accumulator buffer) instead of using a separate
// merge stage.
class cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL
{
public:
    typedef int8_t operand_type;
    typedef int32_t result_type;

    // Signature of the generated kernel entry point.
    typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);

    /* Kernel blocking parameters */
    // Output rows produced per block: four vector lengths of int32 elements.
    static unsigned int out_height()
    {
        return sme::get_vector_length<int32_t>() * 4;
    }

    // Output columns produced per block: one vector length of int32 elements.
    static unsigned int out_width()
    {
        return sme::get_vector_length<int32_t>() * 1;
    }

    // The K dimension is consumed in groups of four int8 values per output
    // element (matching the widening int8 -> int32 outer-product step).
    static constexpr unsigned int k_unroll()
    {
        return 4;
    }

    // Partial results can be carried across calls via the accumulator buffer.
    static constexpr bool supports_accumulate()
    {
        return true;
    }

    static constexpr bool supports_bias()
    {
        return true;
    }

    // Activation is not fused into this kernel; apply it separately if needed.
    static constexpr bool supports_activation()
    {
        return false;
    }

    static constexpr bool is_sme()
    {
        return true;
    }

    // Default to the generic kernel
    kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL;

    // Operand interleave/transform helpers: 4x1 VL blocking, k_unroll of 4.
    StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};

    cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *ci)
    {
        ARM_COMPUTE_UNUSED(ci);
    }
};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..3623f5b6c0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+{
+ ARM_COMPUTE_UNUSED(act);
+
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int32_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int32_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int32_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+
+ const int32_t *const bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "ldnt1w { z15.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0408350 // ld1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "ldnt1b { z7.b }, p1/Z, [x22]\n"
+ ".inst 0xa041835c // ld1b { z28.b-z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa0428340 // ld1b { z0.b-z3.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa0438358 // ld1b { z24.b-z27.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z23.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
+ ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
+ ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+ ".inst 0xa0408350 // ld1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
+ "ldnt1b { z7.b }, p1/Z, [x22]\n"
+ ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
+ ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
+ ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
+ ".inst 0xa041835c // ld1b { z28.b-z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
+ "ldnt1b { z13.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
+ ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
+ ".inst 0xa0428340 // ld1b { z0.b-z3.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
+ ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
+ ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
+ ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
+ ".inst 0xa0438358 // ld1b { z24.b-z27.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z23.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+ ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
+ ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
+ ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+ ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
+ ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
+ ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
+ ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
+ ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
+ ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
+ ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
+ ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
+ ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
+ ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
+ ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa0408350 // ld1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1b { z7.b }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+ ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
+ ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
+ ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1bc // st1w { z28.s-z31.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 29f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1a4 // st1w { z4.s-z7.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 29f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "sub x24, x11, x10\n"
+ "cntw x23\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "cmp x24, x23\n"
+ "csel x21, x24, x23, LT\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "lsr x20, x21, #0x2\n"
+ "madd x25, x10, x22, x25\n" // C += m * ldc
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ "st1w { z28.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z29.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z30.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z31.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "st1w { z8.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z9.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 17f\n"
+ "st1w { z10.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x23\n"
+ "csel x21, x24, x23, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ "st1w { z0.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z1.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z2.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z3.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 20f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "20:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x23\n"
+ "csel x21, x24, x23, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 22f\n"
+ "21:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 23f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ "st1w { z0.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 23f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z1.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 23f\n"
+ "st1w { z2.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "23:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x23\n"
+ "csel x19, x24, x23, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 25f\n"
+ "24:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z13.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z14.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z15.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 26f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 26f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 26f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "26:" // Store to output array: Accumulator row 3 oddments: End
+ "27:" // Store to output array: End
+ "tbz x15, #0, 29f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "28:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 28b\n"
+ "29:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..c7bd38d905
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..100f15c7e0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ const long kstride_bytes;
+ uint8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x13, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x13, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w9, [%x[args], %[offsetof_M]]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldr w26, [%x[args], %[offsetof_N]]\n"
+ "ldr x25, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x24, x25\n"
+ ".inst 0x25ba6770 // whilelt pn8.s, x27, x26, VLx4\n"
+ "tbnz x13, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa01bc279 // ldnt1w { z24.s-z27.s }, p8/Z, [x19, x27, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n"
+ ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x27\n"
+ "mov x20, x28\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x26\n"
+ "csel x20, x28, x20, LT\n"
+ "mov x19, x13\n"
+ "bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
+ "cmp x20, x9\n"
+ "csel x13, x19, x13, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x27, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xa04086dc // ld1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "ld1w { z14.s }, p1/Z, [x24]\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n"
+ "tbz x13, #1, 14f\n"
+ "tbz x13, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xa061c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa062c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 11b\n"
+ "b 21f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa061c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 13b\n"
+ "b 21f\n"
+ "14:" // Store to output array
+ "ldr x23, [%x[args], %[offsetof_C]]\n"
+ "add x23, x23, x27\n" // C += n
+ "sub x22, x9, x28\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x21, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x23, x28, x21, x23\n" // C += m * ldc
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x13, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x27\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c26c // ld1w { z12.s-z15.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c264 // ld1w { z4.s-z7.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x19\n"
+ "whilelt p0.b, x27, x26\n"
+ "cmp x22, x19\n"
+ "csel x19, x22, x19, LT\n"
+ "lsr x20, x19, #0x1\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x1\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x20, LSL #1\n"
+ ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
+ ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
+ ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
+ ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
+ ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
+ ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
+ ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
+ ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
+ ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
+ ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
+ ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
+ ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z23.b, z2.b, z24.b\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z16.b, z10.b\n"
+ "uzp1 z16.b, z23.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "19:" // Store to output array: End
+ "tbz x13, #0, 21f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "20:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 20b\n"
+ "21:" // End block
+ "incw x27, ALL, MUL #4\n"
+ "cmp x27, x26\n"
+ "blt 3b\n"
+ "incw x28\n"
+ "cmp x28, x9\n"
+ "mov x27, #0x0\n"
+ "mov x25, x24\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..123405bd17
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..6c42012482
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+// SME2 u8-quantised interleaved GEMM kernel, 2VL x 2VL output tile.
+// Computes C = requantise(A * B + bias [+ row/col sums]) over an M x N x K
+// problem using ZA outer-product accumulation (UMOPA), or spills/reloads the
+// raw int32 accumulators through 'accumulator_buffer' when the tile is not
+// final (accumulate==true fills ZA from the buffer; C==nullptr stores back
+// to it instead of emitting output). 'rq' supplies either per-layer or
+// per-channel (rq.per_channel_requant) multipliers/shifts plus c_offset and
+// the min/max clamp; 'bias' may be null (checked with cbz before use);
+// 'n_0' is the absolute column origin used to index per-channel parameters.
+void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ // Flattened argument record; the asm below reads fields via offsetof().
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ // Bytes per packed-B column panel: K rounded up to a multiple of 4.
+ const long kstride_bytes;
+ uint8_t *const C;
+ // Output row stride in bytes.
+ const long ldcb;
+ // NOTE(review): n_loops and n_tail_iters are computed here but have no
+ // offsetof() entry in the asm operand list below, so this kernel's asm
+ // never reads them — presumably kept for parity with sibling kernels.
+ const long M, N, K, n_loops, n_tail_iters;
+ // NOTE(review): min/max likewise are not referenced by the asm; the
+ // clamp bounds are loaded from rq.minval / rq.maxval instead.
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ // Machine-generated SME2 assembly (encoded as .inst words; each carries
+ // its disassembly). Structure: optional ZA fill from the accumulator
+ // buffer; M/N tile loop; bias broadcast (ADDHA); 4x-unrolled UMOPA K loop
+ // plus tail and oddments; row-sum ADDVA; then either spill raw int32
+ // accumulators to the buffer or requantise (sqdmulh/srshl/add/sclamp),
+ // narrow with uzp1 and store bytes to C.
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa0094275 // ldnt1w { z20.s-z21.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n"
+ ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa04006d0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040474e // ld1w { z14.s-z15.s }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404262 // ld1w { z2.s-z3.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404260 // ld1w { z0.s-z1.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z12.h, z28.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z13.h, z29.h\n"
+ "uzp1 z17.h, z14.h, z30.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z15.h, z31.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z28.h, z12.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z29.h, z13.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "uzp1 z16.h, z30.h, z14.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 22f\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
+ "uzp1 z17.h, z6.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z7.h, z19.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z20.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z21.h, z17.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "uzp1 z16.h, z22.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "22:" // Store to output array: End
+ "tbz x15, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..2e61cf49a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+// Kernel descriptor for the tall-and-narrow SME2 interleaved "no-merge"
+// MOPA GEMM: quantised uint8 operands/output, producing a 4VL x 1VL tile
+// of 32-bit ZA accumulators per call (requantisation happens in-kernel).
+class cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL
+{
+public:
+ typedef uint8_t operand_type; // element type of the packed A and B panels
+ typedef uint8_t result_type; // element type of the requantised output C
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ // Output rows per kernel call: four vectors of 32-bit lanes.
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 4;
+ }
+
+ // Output columns per kernel call: a single vector of 32-bit lanes.
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 1;
+ }
+
+ // K is processed (and padded) in blocks of 4, matching the 8-bit
+ // dot-product granularity of the MOPA outer-product instruction.
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ // Accumulating directly into C is not supported; partial results are
+ // carried between calls via the int32 accumulator buffer instead.
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ // No fused activation; only the requantise min/max clamp is applied.
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_4VLx1VL;
+
+ // Interleave transforms: 4 VL-row blocks x 1 VL-col block, k-block 4,
+ // quantised variant (trailing 'true').
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..40d2fff8c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ const long kstride_bytes;
+ uint8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "ldnt1w { z15.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1b { z0.b }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n"
+ ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n"
+ ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1bc // st1w { z28.s-z31.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1b4 // st1w { z20.s-z23.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z8.s }, p0/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z7.s }, p0/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z15.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
+ "st1b { z28.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z29.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "st1b { z30.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
+ "st1b { z24.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z25.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z26.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z27.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
+ "st1b { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
+ "st1b { z0.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z1.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "st1b { z2.s }, p0, [x25]\n"
+ "27:" // Store to output array: Accumulator row 3 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE