aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/kernels
diff options
context:
space:
mode:
authorViet-Hoa Do <viet-hoa.do@arm.com>2022-06-01 11:47:14 +0100
committerViet-Hoa Do <viet-hoa.do@arm.com>2022-11-28 16:57:42 +0000
commit03b2971ac69a86f10a1566938d1a25afee15746c (patch)
treeaec7cfc047e1da278b4b71a706cda7b1b0faa158 /src/core/NEON/kernels/arm_gemm/kernels
parent7dc0234331f2150a6b4ac5c2b49de419870f7cf5 (diff)
downloadComputeLibrary-03b2971ac69a86f10a1566938d1a25afee15746c.tar.gz
Integrate SME2 kernels
* Add SME/SME2 detection. * Integrate SME2 implementation for: - Normal convolution - Winograd - Depthwise convolution - Pooling Resolves: COMPMID-5700 Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com> Change-Id: I2f1ca1d05f8cfeee9309ed1c0a36096a4a6aad5c Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8692 Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels')
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp87
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp554
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp553
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp87
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp611
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp678
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp678
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp420
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp486
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp418
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp484
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp616
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp408
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp455
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp507
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp345
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp378
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp444
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp408
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp455
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp94
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp507
40 files changed, 11865 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
new file mode 100644
index 0000000000..f86bcebe64
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_bf16fp32_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_bf16fp32_dot_16VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 2> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_bf16fp32_dot_16VL;
+ cls_sme2_gemv_bf16fp32_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..26861fb931
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_bf16fp32_dot_16VL/generic.cpp
@@ -0,0 +1,554 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_bf16fp32_dot_16VL (
+ const bfloat16 *A_ptr, const bfloat16 *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x21, x21, #0x1\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 28f\n"
+ "cmp x26, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x8\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 13f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x20, #0x8\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "cmp x20, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 21f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x20, #0x8\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "cmp x20, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x1\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 29f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa043c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x20, #0x8\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x8\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ "cmp x20, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ ".inst 0xa043a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.h, XZR, x20\n"
+ "ld1rqh { z10.h }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xa040a721 // ldnt1h { z0.h-z3.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15ab018 // bfdot za.s[x9, 0], { z0.h-z3.h }, z10.h[0]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab099 // bfdot za.s[x9, 1], { z4.h-z7.h }, z10.h[0]\n"
+ ".inst 0xa042a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab29a // bfdot za.s[x9, 2], { z20.h-z23.h }, z10.h[0]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab21b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab718 // bfdot za.s[x9, 0], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15ab619 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[1]\n"
+ ".inst 0xa042a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15ab71a // bfdot za.s[x9, 2], { z24.h-z27.h }, z10.h[1]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15ab61b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc15ab998 // bfdot za.s[x9, 0], { z12.h-z15.h }, z10.h[2]\n"
+ ".inst 0xa041a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aba19 // bfdot za.s[x9, 1], { z16.h-z19.h }, z10.h[2]\n"
+ ".inst 0xa042a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abb9a // bfdot za.s[x9, 2], { z28.h-z31.h }, z10.h[2]\n"
+ ".inst 0xa043a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aba9b // bfdot za.s[x9, 3], { z20.h-z23.h }, z10.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15abe18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa041a739 // ldnt1h { z24.h-z27.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15abf19 // bfdot za.s[x9, 1], { z24.h-z27.h }, z10.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15abe1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z10.h[3]\n"
+ ".inst 0xa043a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15abe1b // bfdot za.s[x9, 3], { z16.h-z19.h }, z10.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "35:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p1.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
new file mode 100644
index 0000000000..f33cb9a33d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const float *, const float *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_fp32_mla_16VL( ARGLIST );
+
+class cls_sme2_gemv_fp32_mla_16VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 1> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_fp32_mla_16VL;
+ cls_sme2_gemv_fp32_mla_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
new file mode 100644
index 0000000000..4c0ae2c6bd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32_mla_16VL/generic.cpp
@@ -0,0 +1,553 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_fp32_mla_16VL (
+ const float *A_ptr, const float *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x21, x21, #0x2\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 28f\n"
+ "cmp x26, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x4\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "cmp x20, #0x4\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 13f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x20, #0x4\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "cmp x20, #0x4\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c314 // st1w { z20.s-z23.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 21f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x20, #0x4\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "cmp x20, #0x4\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c310 // st1w { z16.s-z19.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 29f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6e8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042d01 // mova za.d[x9, #1], { z8.d-z11.d }\n"
+ ".inst 0xa042c6e4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042c82 // mova za.d[x9, #2], { z4.d-z7.d }\n"
+ ".inst 0xa043c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x20, #0x4\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "sub x20, x20, #0x4\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ "cmp x20, #0x4\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ ".inst 0xa043c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x20\n"
+ "ld1rqw { z10.s }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xa040c721 // ldnt1w { z0.s-z3.s }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc15aa000 // fmla za.s[x9, 0], { z0.s-z3.s }, z10.s[0]\n"
+ ".inst 0xa041c725 // ldnt1w { z4.s-z7.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa081 // fmla za.s[x9, 1], { z4.s-z7.s }, z10.s[0]\n"
+ ".inst 0xa042c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa282 // fmla za.s[x9, 2], { z20.s-z23.s }, z10.s[0]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa203 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa700 // fmla za.s[x9, 0], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aa601 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[1]\n"
+ ".inst 0xa042c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aa702 // fmla za.s[x9, 2], { z24.s-z27.s }, z10.s[1]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aa603 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c72d // ldnt1w { z12.s-z15.s }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ ".inst 0xc15aa980 // fmla za.s[x9, 0], { z12.s-z15.s }, z10.s[2]\n"
+ ".inst 0xa041c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaa01 // fmla za.s[x9, 1], { z16.s-z19.s }, z10.s[2]\n"
+ ".inst 0xa042c73d // ldnt1w { z28.s-z31.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aab82 // fmla za.s[x9, 2], { z28.s-z31.s }, z10.s[2]\n"
+ ".inst 0xa043c735 // ldnt1w { z20.s-z23.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aaa83 // fmla za.s[x9, 3], { z20.s-z23.s }, z10.s[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25]\n"
+ ".inst 0xc15aae00 // fmla za.s[x9, 0], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa041c739 // ldnt1w { z24.s-z27.s }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc15aaf01 // fmla za.s[x9, 1], { z24.s-z27.s }, z10.s[3]\n"
+ ".inst 0xa042c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc15aae02 // fmla za.s[x9, 2], { z16.s-z19.s }, z10.s[3]\n"
+ ".inst 0xa043c731 // ldnt1w { z16.s-z19.s }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc15aae03 // fmla za.s[x9, 3], { z16.s-z19.s }, z10.s[3]\n"
+ "addvl x25, x25, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z0.s }, p1/Z, [x20]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ "ld1rw { z6.s }, p1/Z, [x19]\n"
+ ".inst 0xc1a6c808 // fclamp { z8.s-z11.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1a6c814 // fclamp { z20.s-z23.s }, z0.s, z6.s\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1a6c810 // fclamp { z16.s-z19.s }, z0.s, z6.s\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc1a6c818 // fclamp { z24.s-z27.s }, z0.s, z6.s\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c34 // mova { z20.d-z23.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c714 // st1w { z20.s-z23.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c50 // mova { z16.d-z19.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c710 // st1w { z16.s-z19.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0062c78 // mova { z24.d-z27.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c318 // st1w { z24.s-z27.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "35:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p1.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
new file mode 100644
index 0000000000..f52fbcd57f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ const float *, const bfloat16 *, \
+ float *, size_t, size_t, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+void sme2_gemv_fp32bf16fp32_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_fp32bf16fp32_dot_16VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 2> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_fp32bf16fp32_dot_16VL;
+ cls_sme2_gemv_fp32bf16fp32_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..8b8bcb6bc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_fp32bf16fp32_dot_16VL/generic.cpp
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sme2_gemv_fp32bf16fp32_dot_16VL (
+ const float *A_ptr, const bfloat16 *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x28, ALL, MUL #4\n"
+ "add x27, %x[N], x28\n"
+ "sub x27, x27, #0x1\n"
+ "udiv x27, x27, x28\n"
+ "add x21, x27, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x28\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x9, #0x0\n"
+ "mov x26, #0x4\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "lsl x21, x21, #0x1\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[bias]\n"
+ "4:" // Column loop
+ "cmp x27, #0x4\n"
+ "bge 28f\n"
+ "cmp x27, #0x2\n"
+ "bgt 20f\n"
+ "beq 12f\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x8\n"
+ "ble 8f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ "bgt 7b\n"
+ "8:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 9f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "9:" // Width 1: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 10f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "b 11f\n"
+ "10:" // Width 1: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c308 // st1w { z8.s-z11.s }, p8, [x24]\n"
+ "addvl x24, x24, #4\n"
+ "11:" // Width 1: Output done
+ "b 36f\n"
+ "12:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "sub x19, %x[N], x28\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 13f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ "b 14f\n"
+ "13:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "14:" // Width 2: setup done
+ "cmp x20, #0x8\n"
+ "ble 16f\n"
+ "15:" // Width 2: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ "bgt 15b\n"
+ "16:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ "ble 17f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 17f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "17:" // Width 2: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 18f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+ ".inst 0xa061c30c // st1w { z12.s-z15.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "b 19f\n"
+ "18:" // Width 2: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c30c // st1w { z12.s-z15.s }, p8, [x24, #0x4, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "19:" // Width 2: Output done
+ "b 36f\n"
+ "20:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x28, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 21f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c6fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
+ "b 22f\n"
+ "21:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "22:" // Width 3: setup done
+ "cmp x20, #0x8\n"
+ "ble 24f\n"
+ "23:" // Width 3: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ "bgt 23b\n"
+ "24:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ "ble 25f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 25f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "25:" // Width 3: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 26f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
+ ".inst 0xa062c304 // st1w { z4.s-z7.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "b 27f\n"
+ "26:" // Width 3: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c304 // st1w { z4.s-z7.s }, p8, [x24, #0x8, MUL VL]\n"
+ "addvl x24, x24, #12\n"
+ "27:" // Width 3: Output done
+ "b 36f\n"
+ "28:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "lsl x21, %x[K], #0x2\n"
+ "msub x19, x28, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ ".inst 0x25b367f0 // whilelt p8.s, XZR, x19, VLx4\n"
+ "cbz x23, 29f\n"
+ ".inst 0xa040c6e0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x23]\n"
+ ".inst 0xc0042c00 // mova za.d[x9, #0], { z0.d-z3.d }\n"
+ ".inst 0xa041c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e01 // mova za.d[x9, #1], { z16.d-z19.d }\n"
+ ".inst 0xa042c6fc // ld1w { z28.s-z31.s }, pn9.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042f82 // mova za.d[x9, #2], { z28.d-z31.d }\n"
+ ".inst 0xa043c6f0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 30f\n"
+ "29:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "30:" // Width 4: setup done
+ "cmp x20, #0x8\n"
+ "ble 32f\n"
+ "31:" // Width 4: Multiply loop: Main loop head
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "sub x20, x20, #0x8\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "cmp x20, #0x8\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa043a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150bf9b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+ "bgt 31b\n"
+ "32:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p1.s, XZR, x20\n"
+ "whilelt p0.s, x26, x20\n"
+ "ld1rqw { z0.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa800 // bfcvt z0.h, p2/M, z0.s\n"
+ "ld1rqw { z11.s }, p0/Z, [x22, #16]\n"
+ ".inst 0x658aa96b // bfcvt z11.h, p2/M, z11.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "subs x20, x20, #0x2\n"
+ "uzp1 z11.h, z11.h, z11.h\n"
+ "trn1 z0.d, z0.d, z11.d\n"
+ ".inst 0xa040a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25]\n"
+ "add x22, x22, #0x20\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b098 // bfdot za.s[x9, 0], { z4.h-z7.h }, z0.h[0]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b119 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[0]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b21a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xc150b39b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[0]\n"
+ "ble 33f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150b618 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[1]\n"
+ ".inst 0xa041a725 // ldnt1h { z4.h-z7.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150b499 // bfdot za.s[x9, 1], { z4.h-z7.h }, z0.h[1]\n"
+ ".inst 0xa042a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150b59a // bfdot za.s[x9, 2], { z12.h-z15.h }, z0.h[1]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b79b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25]\n"
+ "subs x20, x20, #0x2\n"
+ ".inst 0xc150ba18 // bfdot za.s[x9, 0], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa041a735 // ldnt1h { z20.h-z23.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150ba99 // bfdot za.s[x9, 1], { z20.h-z23.h }, z0.h[2]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150ba1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[2]\n"
+ ".inst 0xa043a72d // ldnt1h { z12.h-z15.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150b99b // bfdot za.s[x9, 3], { z12.h-z15.h }, z0.h[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 33f\n"
+ ".inst 0xa040a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25]\n"
+ ".inst 0xc150bf98 // bfdot za.s[x9, 0], { z28.h-z31.h }, z0.h[3]\n"
+ ".inst 0xa041a729 // ldnt1h { z8.h-z11.h }, pn9.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc150bd19 // bfdot za.s[x9, 1], { z8.h-z11.h }, z0.h[3]\n"
+ ".inst 0xa042a731 // ldnt1h { z16.h-z19.h }, pn9.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc150be1a // bfdot za.s[x9, 2], { z16.h-z19.h }, z0.h[3]\n"
+ ".inst 0xa043a73d // ldnt1h { z28.h-z31.h }, pn9.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc150bf9b // bfdot za.s[x9, 3], { z28.h-z31.h }, z0.h[3]\n"
+ "addvl x25, x25, #16\n"
+ "33:" // Width 4: Multiply loop: multiply skip
+ "tbz %x[flags], #1, 34f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ "ld1rw { z29.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ "ld1rw { z18.s }, p2/Z, [x19]\n"
+ ".inst 0xc1b2cba8 // fclamp { z8.s-z11.s }, z29.s, z18.s\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc1b2cbac // fclamp { z12.s-z15.s }, z29.s, z18.s\n"
+ ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc1b2cba4 // fclamp { z4.s-z7.s }, z29.s, z18.s\n"
+ ".inst 0xa062c704 // st1w { z4.s-z7.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc1b2cba0 // fclamp { z0.s-z3.s }, z29.s, z18.s\n"
+ ".inst 0xa063c300 // st1w { z0.s-z3.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "b 35f\n"
+ "34:" // Width 4: No activation
+ ".inst 0xc0062c08 // mova { z8.d-z11.d }, za.d[x9, #0]\n"
+ ".inst 0xa060c708 // st1w { z8.s-z11.s }, pn9.b, [x24]\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xa061c70c // st1w { z12.s-z15.s }, pn9.b, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc0062c44 // mova { z4.d-z7.d }, za.d[x9, #2]\n"
+ ".inst 0xa062c704 // st1w { z4.s-z7.s }, pn9.b, [x24, #0x8, MUL VL]\n"
+ ".inst 0xc0062c60 // mova { z0.d-z3.d }, za.d[x9, #3]\n"
+ ".inst 0xa063c300 // st1w { z0.s-z3.s }, p8, [x24, #0xc, MUL VL]\n"
+ "addvl x24, x24, #16\n"
+ "35:" // Width 4: Output done
+ "subs x27, x27, #0x4\n"
+ "sub %x[N], %x[N], x28, LSL #2\n"
+ "bgt 4b\n"
+ "36:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p2.b\n"
+ : [N] "+&r" (N)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [output_ptr] "r" (output_ptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
new file mode 100644
index 0000000000..4c9f9cff9a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int8_t *, size_t, size_t, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+void sme2_gemv_s8qa_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_s8qa_dot_16VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 4> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_s8qa_dot_16VL;
+ cls_sme2_gemv_s8qa_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..348c709119
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_s8qa_dot_16VL/generic.cpp
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sme2_gemv_s8qa_dot_16VL (
+ const int8_t *A_ptr, const int8_t *B_ptr, int8_t *output_ptr,
+ size_t N, size_t K,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ ARM_COMPUTE_UNUSED(col_base);
+
+ struct KernelArgs {
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[col_bias]\n"
+ "mov z26.s, #0x0\n"
+ "mov z24.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 34f\n"
+ "cmp x26, #0x2\n"
+ "bgt 24f\n"
+ "beq 14f\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x10\n"
+ "ble 9f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "8:" // Width 1: Multiply loop: unique 1: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "10:" // Width 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "11:" // Width 1: Multiply loop: unique 2: skip row sum
+ "tbnz %x[flags], #31, 12f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "12:" // Width 1: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "13:" // Width 1: Output done
+ "b 44f\n"
+ "14:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 15f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ "b 16f\n"
+ "15:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "16:" // Width 2: setup done
+ "cmp x20, #0x10\n"
+ "ble 19f\n"
+ "17:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 18f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "18:" // Width 2: Multiply loop: unique 3: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 17b\n"
+ "19:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "20:" // Width 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 21f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "21:" // Width 2: Multiply loop: unique 4: skip row sum
+ "tbnz %x[flags], #31, 22f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "22:" // Width 2: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "23:" // Width 2: Output done
+ "b 44f\n"
+ "24:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 25f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ "b 26f\n"
+ "25:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "26:" // Width 3: setup done
+ "cmp x20, #0x10\n"
+ "ble 29f\n"
+ "27:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "28:" // Width 3: Multiply loop: unique 5: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 27b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "30:" // Width 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 31f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "31:" // Width 3: Multiply loop: unique 6: skip row sum
+ "tbnz %x[flags], #31, 32f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "32:" // Width 3: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "st1b { z0.b }, p1, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "33:" // Width 3: Output done
+ "b 44f\n"
+ "34:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 35f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa043c2f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 36f\n"
+ "35:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "36:" // Width 4: setup done
+ "cmp x20, #0x10\n"
+ "ble 39f\n"
+ "37:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 38f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "38:" // Width 4: Multiply loop: unique 7: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 37b\n"
+ "39:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b220 // sdot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6a0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b521 // sdot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b622 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5a3 // sdot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153baa0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8a1 // sdot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9a2 // sdot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bba3 // sdot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bea0 // sdot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153bea1 // sdot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be22 // sdot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be23 // sdot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "40:" // Width 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 41f\n"
+ "sdot z26.s, z3.b, z24.b\n"
+ "41:" // Width 4: Multiply loop: unique 8: skip row sum
+ "tbnz %x[flags], #31, 42f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "42:" // Width 4: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
+ ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z0.b }, p2, [x24, #2, MUL VL]\n"
+ "st1b { z8.b }, p1, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "43:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "44:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p2.b\n"
+ : [N] "+&r" (N), [flags] "+&r" (flags)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
new file mode 100644
index 0000000000..e15b95445e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_sme.hpp"
+
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint8_t *, size_t, size_t, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+void sme2_gemv_u8qa_dot_16VL( ARGLIST );
+
+class cls_sme2_gemv_u8qa_dot_16VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+
+ StdTransformsSME<operand_type, result_type, 1, 16, 4> transforms = {};
+
+
+ // Default to the generic kernel
+ kern_type kernel=sme2_gemv_u8qa_dot_16VL;
+ cls_sme2_gemv_u8qa_dot_16VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
new file mode 100644
index 0000000000..9822f637fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_gemv_u8qa_dot_16VL/generic.cpp
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sme2_gemv_u8qa_dot_16VL (
+ const uint8_t *A_ptr, const uint8_t *B_ptr, uint8_t *output_ptr,
+ size_t N, size_t K,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ ARM_COMPUTE_UNUSED(col_base);
+
+ struct KernelArgs {
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "cntw x27, ALL, MUL #4\n"
+ "add x26, %x[N], x27\n"
+ "sub x26, x26, #0x1\n"
+ "udiv x26, x26, x27\n"
+ "add x21, x26, #0x3\n"
+ "and x21, x21, #0xfffffffffffffffc\n"
+ "mul x21, x21, x27\n"
+ "mov x9, #0x0\n"
+ "mov x25, %x[B_ptr]\n"
+ "mov x24, %x[output_ptr]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mul x21, x21, %x[K]\n"
+ "mov x20, #0x1\n"
+ "1:" // RHS size check loop
+ "cmp x21, #0x200000\n"
+ "blt 2f\n"
+ "tbnz x21, #0, 3f\n"
+ "lsr x21, x21, #0x1\n"
+ "lsl x20, x20, #0x1\n"
+ "b 1b\n"
+ "2:" // RHS do prefetch
+ "lsl x19, x21, #0x26\n"
+ "sub x20, x20, #0x1\n"
+ "lsl x20, x20, #0x16\n"
+ "orr x21, x21, x19\n"
+ "orr x21, x21, x20\n"
+ ".inst 0xf8b54b3a // rprfm pldonce, x21, [x25]\n"
+ "3:" // RHS prefetch exit
+ "mov x23, %x[col_bias]\n"
+ "mov z26.s, #0x0\n"
+ "mov z24.b, #0x1\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "4:" // Column loop
+ "cmp x26, #0x4\n"
+ "bge 34f\n"
+ "cmp x26, #0x2\n"
+ "bgt 24f\n"
+ "beq 14f\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "mov x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 5f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ "b 6f\n"
+ "5:" // Width 1: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "6:" // Width 1: setup done
+ "cmp x20, #0x10\n"
+ "ble 9f\n"
+ "7:" // Width 1: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "8:" // Width 1: Multiply loop: unique 1: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Width 1: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 10f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "10:" // Width 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "11:" // Width 1: Multiply loop: unique 2: skip row sum
+ "tbnz %x[flags], #31, 12f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "12:" // Width 1: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x24]\n"
+ "addvl x24, x24, #1\n"
+ "13:" // Width 1: Output done
+ "b 44f\n"
+ "14:" // Width 2
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "sub x19, %x[N], x27\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 15f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ "b 16f\n"
+ "15:" // Width 2: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "16:" // Width 2: setup done
+ "cmp x20, #0x10\n"
+ "ble 19f\n"
+ "17:" // Width 2: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 18f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "18:" // Width 2: Multiply loop: unique 3: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 17b\n"
+ "19:" // Width 2: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 20f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "20:" // Width 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 21f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "21:" // Width 2: Multiply loop: unique 4: skip row sum
+ "tbnz %x[flags], #31, 22f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "22:" // Width 2: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "23:" // Width 2: Output done
+ "b 44f\n"
+ "24:" // Width 3
+ "mov x19, #0x2\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 25f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ "b 26f\n"
+ "25:" // Width 3: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "26:" // Width 3: setup done
+ "cmp x20, #0x10\n"
+ "ble 29f\n"
+ "27:" // Width 3: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "28:" // Width 3: Multiply loop: unique 5: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 27b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 30f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "30:" // Width 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 31f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "31:" // Width 3: Multiply loop: unique 6: skip row sum
+ "tbnz %x[flags], #31, 32f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "32:" // Width 3: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "st1b { z0.b }, p1, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "33:" // Width 3: Output done
+ "b 44f\n"
+ "34:" // Width 4
+ "mov x19, #0x3\n"
+ "mov x22, %x[A_ptr]\n"
+ "mov x21, %x[K]\n"
+ "msub x19, x27, x19, %x[N]\n"
+ "mov x20, %x[K]\n"
+ ".inst 0xf8b54ad8 // rprfm pldmany, x21, [x22]\n"
+ "whilelt p1.b, XZR, x19\n"
+ "cbz x23, 35f\n"
+ ".inst 0xa040c2e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x23]\n"
+ ".inst 0xc0042c80 // mova za.d[x9, #0], { z4.d-z7.d }\n"
+ ".inst 0xa041c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc0042e81 // mova za.d[x9, #1], { z20.d-z23.d }\n"
+ ".inst 0xa042c2f4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x23, #0x8, MUL VL]\n"
+ ".inst 0xc0042e82 // mova za.d[x9, #2], { z20.d-z23.d }\n"
+ ".inst 0xa043c2f0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x23, #0xc, MUL VL]\n"
+ ".inst 0xc0042e03 // mova za.d[x9, #3], { z16.d-z19.d }\n"
+ "addvl x23, x23, #16\n"
+ "b 36f\n"
+ "35:" // Width 4: no bias
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "36:" // Width 4: setup done
+ "cmp x20, #0x10\n"
+ "ble 39f\n"
+ "37:" // Width 4: Multiply loop: Main loop head
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "tbnz %x[flags], #31, 38f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "38:" // Width 4: Multiply loop: unique 7: skip row sum
+ "sub x20, x20, #0x10\n"
+ "cmp x20, #0x10\n"
+ "bgt 37b\n"
+ "39:" // Width 4: Multiply loop: Single iteration only
+ "whilelt p0.b, XZR, x20\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xa0408331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0xc153b230 // udot za.s[x9, 0], { z16.b-z19.b }, z3.b[0]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b0b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[0]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b1b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[0]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b1b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[0]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153b6b0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[1]\n"
+ ".inst 0xa0418329 // ldnt1b { z8.b-z11.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b531 // udot za.s[x9, 1], { z8.b-z11.b }, z3.b[1]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b632 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[1]\n"
+ ".inst 0xa043832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153b5b3 // udot za.s[x9, 3], { z12.b-z15.b }, z3.b[1]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ "subs x20, x20, #0x4\n"
+ ".inst 0xc153bab0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[2]\n"
+ ".inst 0xa0418325 // ldnt1b { z4.b-z7.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153b8b1 // udot za.s[x9, 1], { z4.b-z7.b }, z3.b[2]\n"
+ ".inst 0xa042832d // ldnt1b { z12.b-z15.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153b9b2 // udot za.s[x9, 2], { z12.b-z15.b }, z3.b[2]\n"
+ ".inst 0xa043833d // ldnt1b { z28.b-z31.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153bbb3 // udot za.s[x9, 3], { z28.b-z31.b }, z3.b[2]\n"
+ "addvl x25, x25, #16\n"
+ "ble 40f\n"
+ ".inst 0xa0408335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25]\n"
+ ".inst 0xc153beb0 // udot za.s[x9, 0], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0418335 // ldnt1b { z20.b-z23.b }, pn8.b/Z, [x25, #0x4, MUL VL]\n"
+ ".inst 0xc153beb1 // udot za.s[x9, 1], { z20.b-z23.b }, z3.b[3]\n"
+ ".inst 0xa0428331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0x8, MUL VL]\n"
+ ".inst 0xc153be32 // udot za.s[x9, 2], { z16.b-z19.b }, z3.b[3]\n"
+ ".inst 0xa0438331 // ldnt1b { z16.b-z19.b }, pn8.b/Z, [x25, #0xc, MUL VL]\n"
+ ".inst 0xc153be33 // udot za.s[x9, 3], { z16.b-z19.b }, z3.b[3]\n"
+ "addvl x25, x25, #16\n"
+ "40:" // Width 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 41f\n"
+ "udot z26.s, z3.b, z24.b\n"
+ "41:" // Width 4: Multiply loop: unique 8: skip row sum
+ "tbnz %x[flags], #31, 42f\n"
+ "add x20, %x[qp], %[b_offset]\n"
+ "mov x19, #0x4\n"
+ "ld1rw { z10.s }, p2/Z, [x20]\n"
+ "neg z10.s, p2/M, z10.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d26, p0, z26.s\n"
+ "mov z26.s, z26.s[0]\n"
+ "mul z26.s, p2/M, z26.s, z10.s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "42:" // Width 4: skip row sum fixup
+ ".inst 0xc0904b40 // addha za0.s, p2/M, p2/M, z26.s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ ".inst 0xc0904b41 // addha za1.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "add x20, %x[qp], %[minval]\n"
+ ".inst 0xc0904b42 // addha za2.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0xc0904b43 // addha za3.s, p2/M, p2/M, z26.s\n"
+ "ld1rw { z21.s }, p2/Z, [x20]\n"
+ ".inst 0xc0062c1c // mova { z28.d-z31.d }, za.d[x9, #0]\n"
+ ".inst 0xc1a5ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z5.s\n"
+ ".inst 0xc0062c2c // mova { z12.d-z15.d }, za.d[x9, #1]\n"
+ ".inst 0xc1a5ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z5.s\n"
+ ".inst 0xc0062c40 // mova { z0.d-z3.d }, za.d[x9, #2]\n"
+ ".inst 0xc1a5ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z5.s\n"
+ ".inst 0xc0062c68 // mova { z8.d-z11.d }, za.d[x9, #3]\n"
+ ".inst 0xc1a5ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z5.s\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ ".inst 0xc1a4aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z4.s\n"
+ ".inst 0xc1a4aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z4.s\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a6ab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z6.s\n"
+ ".inst 0xc1b0cebc // sclamp { z28.s-z31.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0ceac // sclamp { z12.s-z15.s }, z21.s, z16.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ ".inst 0xc1b0cea0 // sclamp { z0.s-z3.s }, z21.s, z16.s\n"
+ ".inst 0xc1b0cea8 // sclamp { z8.s-z11.s }, z21.s, z16.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z0.h, z0.h, z1.h\n"
+ "uzp1 z1.h, z2.h, z3.h\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p2, [x24]\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p2, [x24, #1, MUL VL]\n"
+ "uzp1 z0.b, z0.b, z1.b\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z0.b }, p2, [x24, #2, MUL VL]\n"
+ "st1b { z8.b }, p1, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "43:" // Width 4: Output done
+ "subs x26, x26, #0x4\n"
+ "sub %x[N], %x[N], x27, LSL #2\n"
+ "bgt 4b\n"
+ "44:" // Exit
+ ".inst 0xd503467f // SMSTOP\n"
+ "ptrue p2.b\n"
+ : [N] "+&r" (N), [flags] "+&r" (flags)
+ : [A_ptr] "r" (A_ptr), [B_ptr] "r" (B_ptr), [K] "r" (K), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [output_ptr] "r" (output_ptr), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..37eb63d898
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..bb8cad3357
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_1VLx4VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const bfloat16 *const A,
+ const bfloat16 *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const bfloat16 *const A;
+ const bfloat16 *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa01cc27d // ldnt1w { z28.s-z31.s }, p8/Z, [x19, x28, LSL #2]\n"
+ ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
+ ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
+ ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
+ ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x28\n"
+ "mov x20, x9\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x27\n"
+ "csel x20, x9, x20, LT\n"
+ "mov x19, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x20, x10\n"
+ "csel x14, x19, x14, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x1\n"
+ "lsr x19, x19, #0x1\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x28, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1h { z0.h }, p0/Z, [x25]\n"
+ ".inst 0xa140a6db // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x22]\n"
+ "ld1h { z13.h }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa141a6ca // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1h { z12.h }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142a6cb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1h { z26.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143a6d8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
+ ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
+ ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+ "ld1h { z0.h }, p0/Z, [x25]\n"
+ ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
+ ".inst 0xa140a6db // ldnt1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x22]\n"
+ ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
+ ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
+ ".inst 0x818e01a3 // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
+ "ld1h { z13.h }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
+ ".inst 0xa141a6ca // ldnt1h { z2.h, z6.h, z10.h, z14.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
+ ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
+ ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
+ "ld1h { z12.h }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142a6cb // ldnt1h { z3.h, z7.h, z11.h, z15.h }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
+ ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
+ ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
+ "ld1h { z26.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143a6d8 // ldnt1h { z16.h, z20.h, z24.h, z28.h }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+ ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
+ ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
+ ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+ ".inst 0x818201a0 // bfmopa za0.s, p0/M, p0/M, z13.h, z2.h\n"
+ ".inst 0x818601a1 // bfmopa za1.s, p0/M, p0/M, z13.h, z6.h\n"
+ ".inst 0x818a01a2 // bfmopa za2.s, p0/M, p0/M, z13.h, z10.h\n"
+ ".inst 0x818e01a3 // bfmopa za3.s, p0/M, p0/M, z13.h, z14.h\n"
+ ".inst 0x81830180 // bfmopa za0.s, p0/M, p0/M, z12.h, z3.h\n"
+ ".inst 0x81870181 // bfmopa za1.s, p0/M, p0/M, z12.h, z7.h\n"
+ ".inst 0x818b0182 // bfmopa za2.s, p0/M, p0/M, z12.h, z11.h\n"
+ ".inst 0x818f0183 // bfmopa za3.s, p0/M, p0/M, z12.h, z15.h\n"
+ ".inst 0x81900340 // bfmopa za0.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81940341 // bfmopa za1.s, p0/M, p0/M, z26.h, z20.h\n"
+ ".inst 0x81980342 // bfmopa za2.s, p0/M, p0/M, z26.h, z24.h\n"
+ ".inst 0x819c0343 // bfmopa za3.s, p0/M, p0/M, z26.h, z28.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1h { z0.h }, p0/Z, [x25]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xa140a6d3 // ld1h { z19.h, z23.h, z27.h, z31.h }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0x81930000 // bfmopa za0.s, p0/M, p0/M, z0.h, z19.h\n"
+ ".inst 0x81970001 // bfmopa za1.s, p0/M, p0/M, z0.h, z23.h\n"
+ ".inst 0x819b0002 // bfmopa za2.s, p0/M, p0/M, z0.h, z27.h\n"
+ ".inst 0x819f0003 // bfmopa za3.s, p0/M, p0/M, z0.h, z31.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
+ "addvl x13, x13, #16\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c56c // st1w { z12.s-z15.s }, pn9.b, [x11]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28, LSL #2\n" // C += n
+ "sub x23, x10, x9\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
+ "tbz x14, #2, 18f\n"
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "csel x21, x23, x19, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x23, x23, x21\n"
+ "beq 18f\n"
+ "b 22f\n"
+ "18:" // Store to output array: Skip activation: End
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x19, x23, x19, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "21:" // Store to output array: Accumulator row 0 oddments: End
+ "22:" // Store to output array: End
+ "tbz x14, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
+ "blt 3b\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..89c79cfb0a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..a4a40ad5ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_2VLx2VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const bfloat16 *const A,
+ const bfloat16 *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const bfloat16 *const A;
+ const bfloat16 *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa009426f // ldnt1w { z14.s-z15.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
+ ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x1\n"
+ "lsr x19, x19, #0x1\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1402747 // ld1h { z7.h, z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0xa14026df // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x22]\n"
+ ".inst 0xa0412748 // ld1h { z8.h-z9.h }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa04126c3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422752 // ld1h { z18.h, z26.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa04226d1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa1432756 // ld1h { z22.h, z30.h }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14326cc // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
+ ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
+ ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+ ".inst 0xa1402747 // ld1h { z7.h, z15.h }, pn9.b/Z, [x26]\n"
+ ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
+ ".inst 0xa14026df // ldnt1h { z23.h, z31.h }, pn9.b/Z, [x22]\n"
+ ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
+ ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
+ ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
+ ".inst 0xa0412748 // ld1h { z8.h-z9.h }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
+ ".inst 0xa04126c3 // ldnt1h { z2.h-z3.h }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
+ ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
+ ".inst 0xa1422752 // ld1h { z18.h, z26.h }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa04226d1 // ldnt1h { z16.h-z17.h }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
+ ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
+ ".inst 0xa1432756 // ld1h { z22.h, z30.h }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14326cc // ldnt1h { z4.h, z12.h }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+ ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
+ ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
+ ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+ ".inst 0x81820100 // bfmopa za0.s, p0/M, p0/M, z8.h, z2.h\n"
+ ".inst 0x81830101 // bfmopa za1.s, p0/M, p0/M, z8.h, z3.h\n"
+ ".inst 0x81820122 // bfmopa za2.s, p0/M, p0/M, z9.h, z2.h\n"
+ ".inst 0x81830123 // bfmopa za3.s, p0/M, p0/M, z9.h, z3.h\n"
+ ".inst 0x81900240 // bfmopa za0.s, p0/M, p0/M, z18.h, z16.h\n"
+ ".inst 0x81910241 // bfmopa za1.s, p0/M, p0/M, z18.h, z17.h\n"
+ ".inst 0x81900342 // bfmopa za2.s, p0/M, p0/M, z26.h, z16.h\n"
+ ".inst 0x81910343 // bfmopa za3.s, p0/M, p0/M, z26.h, z17.h\n"
+ ".inst 0x818402c0 // bfmopa za0.s, p0/M, p0/M, z22.h, z4.h\n"
+ ".inst 0x818c02c1 // bfmopa za1.s, p0/M, p0/M, z22.h, z12.h\n"
+ ".inst 0x818403c2 // bfmopa za2.s, p0/M, p0/M, z30.h, z4.h\n"
+ ".inst 0x818c03c3 // bfmopa za3.s, p0/M, p0/M, z30.h, z12.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1402747 // ld1h { z7.h, z15.h }, pn9.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa14026d7 // ld1h { z23.h, z31.h }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0x819700e0 // bfmopa za0.s, p0/M, p0/M, z7.h, z23.h\n"
+ ".inst 0x819f00e1 // bfmopa za1.s, p0/M, p0/M, z7.h, z31.h\n"
+ ".inst 0x819701e2 // bfmopa za2.s, p0/M, p0/M, z15.h, z23.h\n"
+ ".inst 0x819f01e3 // bfmopa za3.s, p0/M, p0/M, z15.h, z31.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 21f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa1604334 // st1w { z20.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604335 // st1w { z21.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ ".inst 0xa1604336 // st1w { z22.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "b 28f\n"
+ "21:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604320 // st1w { z0.s, z8.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604321 // st1w { z1.s, z9.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ ".inst 0xa1604322 // st1w { z2.s, z10.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "27:" // Store to output array: Accumulator row 1 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..0d407e0cba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../bfloat.hpp"
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 2> transforms = {};
+
+ cls_sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..798a3cb470
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL(const bfloat16 *const A, const bfloat16 *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
  // Argument block handed to the inline-assembly kernel by address.
  // The asm reads fields through offsetof(KernelArgs, ...) in its operand
  // list, so the member layout is part of the asm's contract — do not
  // reorder or resize members without updating the assembly.
  struct KernelArgs
  {
    KernelArgs(
      const bfloat16 *const A,
      const bfloat16 *const B,
      float *const C, const int ldc,
      const int M, const int N, const int K,
      const float *const bias,
      const Activation act,
      bool accumulate,
      float *const accumulator_buffer
    ) : A(A),
        B(B), kstride_bytes(roundup(K, 2) * sizeof(bfloat16)),
        C(C), ldcb(ldc * sizeof(float)),
        M(M), N(N), K(K),
        n_loops(((K / 2) - 1) / 2), n_tail_iters(((K / 2) - 1) % 2),
        min(-std::numeric_limits<float>::infinity()),
        max(std::numeric_limits<float>::infinity()),
        bias(bias),
        accumulator_buffer(accumulator_buffer),
        flags(0x0)
    {
      if (accumulate)
      {
        flags |= 1 << 0;  // FILL_ACCUMULATORS_FROM_BUFFER
      }
      if (C == nullptr)
      {
        flags |= 1 << 1;  // STORE_ACCUMULATORS_TO_BUFFER
      }
      if (act.type == Activation::Type::None)
      {
        flags |= 1 << 2;  // SKIP_ACTIVATION
      }

      // Initialise the activation values
      switch (act.type)
      {
        default:
        case Activation::Type::None:
          break;
        case Activation::Type::BoundedReLU:
          // BoundedReLU clamps above at param1 and, by falling through,
          // below at zero like plain ReLU.
          this->max = static_cast<float>(act.param1);
          /* fall through */
        case Activation::Type::ReLU:
          this->min = static_cast<float>(0);
          break;
      }
    }

    const bfloat16 *const A;
    const bfloat16 *const B;
    // Byte stride between packed B panels; K is rounded up to a multiple
    // of 2 because the kernel consumes bf16 elements in pairs.
    const long kstride_bytes;
    float *const C;
    // Leading dimension of C, in bytes (ldc * sizeof(float)).
    const long ldcb;
    // NOTE(review): n_loops/n_tail_iters are initialised here but the asm
    // recomputes its K-loop trip counts directly from K — these two may be
    // unused by this kernel; confirm against the asm operand list.
    const long M, N, K, n_loops, n_tail_iters;
    // Activation clamp bounds applied during the store-to-output phase.
    float min = -std::numeric_limits<float>::infinity();
    float max = std::numeric_limits<float>::infinity();

    const float *const bias;

    // Spill/fill area for ZA accumulators between kernel invocations.
    float *const accumulator_buffer;
    // Bit 0: fill accumulators from buffer; bit 1: store accumulators to
    // buffer; bit 2: skip activation (see constructor body above).
    uint64_t flags;
  };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z8.s, #1.0\n"
+ "ldnt1w { z27.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x1\n"
+ "lsr x19, x19, #0x1\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa040a344 // ld1h { z4.h-z7.h }, pn8.b/Z, [x26]\n"
+ "ldnt1h { z29.h }, p1/Z, [x22]\n"
+ ".inst 0xa041a34c // ld1h { z12.h-z15.h }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1h { z23.h }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa042a340 // ld1h { z0.h-z3.h }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1h { z21.h }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa143a352 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
+ ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
+ ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+ ".inst 0xa040a344 // ld1h { z4.h-z7.h }, pn8.b/Z, [x26]\n"
+ ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
+ "ldnt1h { z29.h }, p1/Z, [x22]\n"
+ ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
+ ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
+ ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
+ ".inst 0xa041a34c // ld1h { z12.h-z15.h }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
+ "ldnt1h { z23.h }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
+ ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
+ ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
+ ".inst 0xa042a340 // ld1h { z0.h-z3.h }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1h { z21.h }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
+ ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
+ ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
+ ".inst 0xa143a352 // ld1h { z18.h, z22.h, z26.h, z30.h }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1h { z27.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+ ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
+ ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
+ ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+ ".inst 0x81972580 // bfmopa za0.s, p1/M, p1/M, z12.h, z23.h\n"
+ ".inst 0x819725a1 // bfmopa za1.s, p1/M, p1/M, z13.h, z23.h\n"
+ ".inst 0x819725c2 // bfmopa za2.s, p1/M, p1/M, z14.h, z23.h\n"
+ ".inst 0x819725e3 // bfmopa za3.s, p1/M, p1/M, z15.h, z23.h\n"
+ ".inst 0x81952400 // bfmopa za0.s, p1/M, p1/M, z0.h, z21.h\n"
+ ".inst 0x81952421 // bfmopa za1.s, p1/M, p1/M, z1.h, z21.h\n"
+ ".inst 0x81952442 // bfmopa za2.s, p1/M, p1/M, z2.h, z21.h\n"
+ ".inst 0x81952463 // bfmopa za3.s, p1/M, p1/M, z3.h, z21.h\n"
+ ".inst 0x819b2640 // bfmopa za0.s, p1/M, p1/M, z18.h, z27.h\n"
+ ".inst 0x819b26c1 // bfmopa za1.s, p1/M, p1/M, z22.h, z27.h\n"
+ ".inst 0x819b2742 // bfmopa za2.s, p1/M, p1/M, z26.h, z27.h\n"
+ ".inst 0x819b27c3 // bfmopa za3.s, p1/M, p1/M, z30.h, z27.h\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040a344 // ld1h { z4.h-z7.h }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z29.h }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0x819d2480 // bfmopa za0.s, p1/M, p1/M, z4.h, z29.h\n"
+ ".inst 0x819d24a1 // bfmopa za1.s, p1/M, p1/M, z5.h, z29.h\n"
+ ".inst 0x819d24c2 // bfmopa za2.s, p1/M, p1/M, z6.h, z29.h\n"
+ ".inst 0x819d24e3 // bfmopa za3.s, p1/M, p1/M, z7.h, z29.h\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a4 // st1w { z4.s-z7.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 42f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1ac // st1w { z12.s-z15.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 42f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 27f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 22f\n"
+ "21:" // Store to output array: Skip activation: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Skip activation: Accumulator row 2 oddments
+ "cbz x19, 23f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 25f\n"
+ "24:" // Store to output array: Skip activation: Accumulator row 3 loop
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z7.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Skip activation: Accumulator row 3 oddments
+ "cbz x19, 26f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "st1w { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "b 40f\n"
+ "27:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 29f\n"
+ "28:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 28b\n"
+ "29:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 30f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "st1w { z8.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z9.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "st1w { z10.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "30:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 32f\n"
+ "31:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 31b\n"
+ "32:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 33f\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "33:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 35f\n"
+ "34:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 34b\n"
+ "35:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 36f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "36:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 38f\n"
+ "37:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 37b\n"
+ "38:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 39f\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "39:" // Store to output array: Accumulator row 3 oddments: End
+ "40:" // Store to output array: End
+ "tbz x15, #0, 42f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "41:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 41b\n"
+ "42:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..7777349b42
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+// Kernel descriptor for sme2_interleaved_nomerge_fp32_mopa_1VLx4VL.
+// Exposes the blocking geometry (one SVE vector length of output rows by
+// four vector lengths of output columns per tile) and the kernel's
+// capabilities (accumulate / bias / activation support) to the arm_gemm
+// framework's kernel-selection machinery.
+class cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ // Rows of C produced per tile: one vector length of fp32 elements.
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<float>() * 1;
+ }
+
+ // Columns of C produced per tile: four vector lengths of fp32 elements.
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<float>() * 4;
+ }
+
+ // No K-dimension unrolling is required by the transforms for this kernel.
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return true;
+ }
+
+ // Marks this kernel as requiring SME streaming mode so the framework can
+ // gate it on CPU capability detection.
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_1VLx4VL;
+
+ // Interleaving transforms matching the 1VL x 4VL blocking (k_unroll 1).
+ StdTransformsSME<operand_type, result_type, 1, 4, 1> transforms = {};
+
+ cls_sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..4f6d9a3d98
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+// SME2 FP32 interleaved GEMM kernel operating on a 1VL x 4VL output tile
+// using FMOPA (floating-point outer product and accumulate) into the ZA
+// array, with no separate merge step: accumulators are either written
+// directly to C (with an fclamp activation unless skipped) or spilled
+// to / refilled from accumulator_buffer when a block spans multiple calls.
+void sme2_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ // Argument block handed to the assembly; the asm reads fields via the
+ // offsetof_* operands below, so field layout must match those offsets.
+ struct KernelArgs
+ {
+ KernelArgs(
+ const float *const A,
+ const float *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(K * sizeof(float)),
+ C(C), ldcb(ldc * sizeof(float)),
+ // NOTE(review): n_loops/n_tail_iters are not referenced by the
+ // assembly below (it recomputes K/4 and K%4 from K itself) — confirm
+ // whether they are kept only for interface parity with other kernels.
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const float *const A;
+ const float *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ // SME2 instructions are emitted as raw ".inst" encodings (mnemonic shown
+ // in the trailing comment of each string) — presumably so the file
+ // assembles with toolchains lacking SME2 mnemonic support; confirm.
+ // Phases: SMSTART -> optional ZA refill -> per-(m,n)-block bias/zero,
+ // K loop of FMOPAs -> store-out (clamped or raw) or spill -> SMSTOP.
+ __asm__ __volatile__(
+ "ldr x14, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x14, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c5a4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w10, [%x[args], %[offsetof_M]]\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "ldr w27, [%x[args], %[offsetof_N]]\n"
+ "ldr x26, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x25, x26\n"
+ ".inst 0x25bb6790 // whilelt pn8.s, x28, x27, VLx4\n"
+ "tbnz x14, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ // Bias is applied as an outer product of a vector of 1.0f with the
+ // bias row, seeding all four ZA tiles in one pass.
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa01cc27d // ldnt1w { z28.s-z31.s }, p8/Z, [x19, x28, LSL #2]\n"
+ ".inst 0x809c02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z28.s\n"
+ ".inst 0x809d02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z29.s\n"
+ ".inst 0x809e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z30.s\n"
+ ".inst 0x809f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z31.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x28\n"
+ "mov x20, x9\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x27\n"
+ "csel x20, x9, x20, LT\n"
+ "mov x19, x14\n"
+ "bfm x14, XZR, #0x0, #0x0 // bfc x14, #0x0, #0x1\n"
+ "cmp x20, x10\n"
+ "csel x14, x19, x14, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "lsr x22, x19, #0x2\n"
+ "and x21, x19, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_B]]\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x20, x28, x19, x20\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z0.s }, p0/Z, [x25]\n"
+ ".inst 0xa140c69b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x20]\n"
+ "ld1w { z13.s }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0xa141c68a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ "ld1w { z12.s }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142c68b // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x20, #0x8, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143c698 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x20, #0xc, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
+ ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
+ ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+ "ld1w { z0.s }, p0/Z, [x25]\n"
+ ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
+ ".inst 0xa140c69b // ldnt1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x20]\n"
+ ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
+ ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
+ ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
+ "ld1w { z13.s }, p0/Z, [x25, #1, MUL VL]\n"
+ ".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
+ ".inst 0xa141c68a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
+ ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
+ ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
+ "ld1w { z12.s }, p0/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xa142c68b // ldnt1w { z3.s, z7.s, z11.s, z15.s }, pn9.b/Z, [x20, #0x8, MUL VL]\n"
+ ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
+ ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
+ ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
+ "ld1w { z26.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ ".inst 0xa143c698 // ldnt1w { z16.s, z20.s, z24.s, z28.s }, pn9.b/Z, [x20, #0xc, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+ ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
+ ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
+ ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+ ".inst 0x808201a0 // fmopa za0.s, p0/M, p0/M, z13.s, z2.s\n"
+ ".inst 0x808601a1 // fmopa za1.s, p0/M, p0/M, z13.s, z6.s\n"
+ ".inst 0x808a01a2 // fmopa za2.s, p0/M, p0/M, z13.s, z10.s\n"
+ ".inst 0x808e01a3 // fmopa za3.s, p0/M, p0/M, z13.s, z14.s\n"
+ ".inst 0x80830180 // fmopa za0.s, p0/M, p0/M, z12.s, z3.s\n"
+ ".inst 0x80870181 // fmopa za1.s, p0/M, p0/M, z12.s, z7.s\n"
+ ".inst 0x808b0182 // fmopa za2.s, p0/M, p0/M, z12.s, z11.s\n"
+ ".inst 0x808f0183 // fmopa za3.s, p0/M, p0/M, z12.s, z15.s\n"
+ ".inst 0x80900340 // fmopa za0.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80940341 // fmopa za1.s, p0/M, p0/M, z26.s, z20.s\n"
+ ".inst 0x80980342 // fmopa za2.s, p0/M, p0/M, z26.s, z24.s\n"
+ ".inst 0x809c0343 // fmopa za3.s, p0/M, p0/M, z26.s, z28.s\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1w { z0.s }, p0/Z, [x25]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x25, x25, #1\n"
+ ".inst 0xa140c693 // ld1w { z19.s, z23.s, z27.s, z31.s }, pn9.b/Z, [x20]\n"
+ "addvl x20, x20, #4\n"
+ ".inst 0x80930000 // fmopa za0.s, p0/M, p0/M, z0.s, z19.s\n"
+ ".inst 0x80970001 // fmopa za1.s, p0/M, p0/M, z0.s, z23.s\n"
+ ".inst 0x809b0002 // fmopa za2.s, p0/M, p0/M, z0.s, z27.s\n"
+ ".inst 0x809f0003 // fmopa za3.s, p0/M, p0/M, z0.s, z31.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x14, #1, 14f\n"
+ "tbz x14, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5bc // ld1w { z28.s-z31.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa043c5ac // ld1w { z12.s-z15.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c578 // st1w { z24.s-z27.s }, pn9.b, [x11]\n"
+ "addvl x13, x13, #16\n"
+ ".inst 0xa061c574 // st1w { z20.s-z23.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ ".inst 0xa062c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xa060c56c // st1w { z12.s-z15.s }, pn9.b, [x11]\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ ".inst 0xa061c57c // st1w { z28.s-z31.s }, pn9.b, [x11, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c570 // st1w { z16.s-z19.s }, pn9.b, [x11, #0x8, MUL VL]\n"
+ ".inst 0xa063c564 // st1w { z4.s-z7.s }, pn9.b, [x11, #0xc, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x24, [%x[args], %[offsetof_C]]\n"
+ "add x24, x24, x28, LSL #2\n" // C += n
+ "sub x23, x10, x9\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x24, x9, x22, x24\n" // C += m * ldc
+ "tbz x14, #2, 18f\n"
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "csel x21, x23, x19, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 17f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x23, x23, x21\n"
+ "beq 18f\n"
+ "b 22f\n"
+ "18:" // Store to output array: Skip activation: End
+ "cntw x19\n"
+ "cmp x23, x19\n"
+ "ld1rw { z23.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x19, x23, x19, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "ld1rw { z16.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ ".inst 0xa160c303 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc1b0cae0 // fclamp { z0.s-z3.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0cae4 // fclamp { z4.s-z7.s }, z23.s, z16.s\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xc1b0cae8 // fclamp { z8.s-z11.s }, z23.s, z16.s\n"
+ ".inst 0xc1b0caec // fclamp { z12.s-z15.s }, z23.s, z16.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c300 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c301 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x24]\n"
+ "add x24, x24, x22\n"
+ "beq 21f\n"
+ ".inst 0xa160c302 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x24]\n"
+ "21:" // Store to output array: Accumulator row 0 oddments: End
+ "22:" // Store to output array: End
+ "tbz x14, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5b0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x13, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5a8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x13, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x13, x13, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x28, ALL, MUL #4\n"
+ "cmp x28, x27\n"
+ "blt 3b\n"
+ "incw x9\n"
+ "cmp x9, x10\n"
+ "mov x28, #0x0\n"
+ "mov x26, x25\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..51e8c43335
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL
+{
+public:
+  typedef float operand_type;
+  typedef float result_type;
+
+  typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+  /* Kernel blocking parameters: each block computes a 2VL x 2VL output tile (VL = SVE vector length in float elements). */
+  static unsigned int out_height() // rows of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 2;
+  }
+
+  static unsigned int out_width() // columns of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 2;
+  }
+
+  static constexpr unsigned int k_unroll() // K elements consumed per accumulation step (no unrolling)
+  {
+    return 1;
+  }
+
+  static constexpr bool supports_accumulate()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_bias()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_activation()
+  {
+    return true;
+  }
+
+  static constexpr bool is_sme()
+  {
+    return true;
+  }
+
+  // Default to the generic (assembly) kernel implementation declared above.
+  kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_2VLx2VL;
+
+  StdTransformsSME<operand_type, result_type, 2, 2, 1> transforms = {}; // interleave transforms matching the 2VL x 2VL blocking, k_unroll = 1
+
+  cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const CPUInfo *ci)
+  {
+    ARM_COMPUTE_UNUSED(ci); // no CPU-specific kernel selection is performed
+  }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..344215bfa5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp32_mopa_2VLx2VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+    struct KernelArgs // argument block; the inline asm reads each field via [args] + offsetof(KernelArgs, ...)
+    {
+      KernelArgs(
+        const float *const A,
+        const float *const B,
+        float *const C, const int ldc,
+        const int M, const int N, const int K,
+        const float *const bias,
+        const Activation act,
+        bool accumulate,
+        float *const accumulator_buffer
+      ) : A(A),
+          B(B), kstride_bytes(K * sizeof(float)),  // byte stride between consecutive B column panels (asm: bptr = B + n * kstride_bytes)
+          C(C), ldcb(ldc * sizeof(float)),  // C row stride in bytes (asm: C += m * ldc)
+          M(M), N(N), K(K),
+          n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),  // NOTE(review): not read by the asm below, which recomputes K/4 and K%4 — confirm before relying on these
+          min(-std::numeric_limits<float>::infinity()),
+          max(std::numeric_limits<float>::infinity()),
+          bias(bias),
+          accumulator_buffer(accumulator_buffer),
+          flags(0x0)
+      {
+        if (accumulate)
+        {
+          flags |= 1 << 0;  // FILL_ACCUMULATORS_FROM_BUFFER
+        }
+        if (C == nullptr)
+        {
+          flags |= 1 << 1;  // STORE_ACCUMULATORS_TO_BUFFER
+        }
+        if (act.type == Activation::Type::None)
+        {
+          flags |= 1 << 2;  // SKIP_ACTIVATION
+        }
+
+        // Initialise the activation clamp values (min/max used by the asm's fclamp)
+        switch (act.type)
+        {
+          default:
+          case Activation::Type::None:
+              break;
+          case Activation::Type::BoundedReLU:
+              this->max = static_cast<float>(act.param1);
+              /* fall through */
+          case Activation::Type::ReLU:
+              this->min = static_cast<float>(0);
+              break;
+        }
+      }
+
+      const float *const A;
+      const float *const B;
+      const long kstride_bytes; // K * sizeof(float): B panel stride in bytes
+      float *const C;
+      const long ldcb; // ldc * sizeof(float): C row stride in bytes
+      const long M, N, K, n_loops, n_tail_iters;
+      float min = -std::numeric_limits<float>::infinity(); // activation clamp lower bound
+      float max = std::numeric_limits<float>::infinity(); // activation clamp upper bound
+
+      const float *const bias;
+
+      float *const accumulator_buffer; // area the asm stores ZA accumulators to / reloads them from between blocks
+      uint64_t flags; // bit 0: fill accumulators from buffer, bit 1: store accumulators to buffer, bit 2: skip activation
+    };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z21.s, #1.0\n"
+ ".inst 0xa009426f // ldnt1w { z14.s-z15.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x808e02a0 // fmopa za0.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a1 // fmopa za1.s, p0/M, p0/M, z21.s, z15.s\n"
+ ".inst 0x808e02a2 // fmopa za2.s, p0/M, p0/M, z21.s, z14.s\n"
+ ".inst 0x808f02a3 // fmopa za3.s, p0/M, p0/M, z21.s, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "lsr x22, x19, #0x2\n"
+ "and x21, x19, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_B]]\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x20, x9, x19, x20\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa1404747 // ld1w { z7.s, z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0xa140469f // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x20]\n"
+ ".inst 0xa0414748 // ld1w { z8.s-z9.s }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa0414683 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa1424752 // ld1w { z18.s, z26.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa0424691 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xa1434756 // ld1w { z22.s, z30.s }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa143468c // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x20, #0x6, MUL VL]\n"
+ "addvl x20, x20, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
+ ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
+ ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+ ".inst 0xa1404747 // ld1w { z7.s, z15.s }, pn9.b/Z, [x26]\n"
+ ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
+ ".inst 0xa140469f // ldnt1w { z23.s, z31.s }, pn9.b/Z, [x20]\n"
+ ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
+ ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
+ ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
+ ".inst 0xa0414748 // ld1w { z8.s-z9.s }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
+ ".inst 0xa0414683 // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
+ ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
+ ".inst 0xa1424752 // ld1w { z18.s, z26.s }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa0424691 // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
+ ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
+ ".inst 0xa1434756 // ld1w { z22.s, z30.s }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa143468c // ldnt1w { z4.s, z12.s }, pn9.b/Z, [x20, #0x6, MUL VL]\n"
+ "addvl x20, x20, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+ ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
+ ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
+ ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+ ".inst 0x80820100 // fmopa za0.s, p0/M, p0/M, z8.s, z2.s\n"
+ ".inst 0x80830101 // fmopa za1.s, p0/M, p0/M, z8.s, z3.s\n"
+ ".inst 0x80820122 // fmopa za2.s, p0/M, p0/M, z9.s, z2.s\n"
+ ".inst 0x80830123 // fmopa za3.s, p0/M, p0/M, z9.s, z3.s\n"
+ ".inst 0x80900240 // fmopa za0.s, p0/M, p0/M, z18.s, z16.s\n"
+ ".inst 0x80910241 // fmopa za1.s, p0/M, p0/M, z18.s, z17.s\n"
+ ".inst 0x80900342 // fmopa za2.s, p0/M, p0/M, z26.s, z16.s\n"
+ ".inst 0x80910343 // fmopa za3.s, p0/M, p0/M, z26.s, z17.s\n"
+ ".inst 0x808402c0 // fmopa za0.s, p0/M, p0/M, z22.s, z4.s\n"
+ ".inst 0x808c02c1 // fmopa za1.s, p0/M, p0/M, z22.s, z12.s\n"
+ ".inst 0x808403c2 // fmopa za2.s, p0/M, p0/M, z30.s, z4.s\n"
+ ".inst 0x808c03c3 // fmopa za3.s, p0/M, p0/M, z30.s, z12.s\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1404747 // ld1w { z7.s, z15.s }, pn9.b/Z, [x26]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa1404697 // ld1w { z23.s, z31.s }, pn9.b/Z, [x20]\n"
+ "addvl x20, x20, #2\n"
+ ".inst 0x809700e0 // fmopa za0.s, p0/M, p0/M, z7.s, z23.s\n"
+ ".inst 0x809f00e1 // fmopa za1.s, p0/M, p0/M, z7.s, z31.s\n"
+ ".inst 0x809701e2 // fmopa za2.s, p0/M, p0/M, z15.s, z23.s\n"
+ ".inst 0x809f01e3 // fmopa za3.s, p0/M, p0/M, z15.s, z31.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5ac // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 21f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa1604334 // st1w { z20.s, z28.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604335 // st1w { z21.s, z29.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ ".inst 0xa1604336 // st1w { z22.s, z30.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 21f\n"
+ "b 28f\n"
+ "21:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z21.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z20.s }, p0/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa4 // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caac // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+ ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604327 // st1w { z7.s, z15.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xc1b4caa0 // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4caa8 // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604320 // st1w { z0.s, z8.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604321 // st1w { z1.s, z9.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ ".inst 0xa1604322 // st1w { z2.s, z10.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xc1b4cab0 // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4cab8 // fclamp { z24.s-z27.s }, z21.s, z20.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
+ "27:" // Store to output array: Accumulator row 1 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..a315ebb323
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL
+{
+public:
+  typedef float operand_type;
+  typedef float result_type;
+
+  typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
+
+  /* Kernel blocking parameters: each block computes a 4VL x 1VL output tile (VL = SVE vector length in float elements). */
+  static unsigned int out_height() // rows of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 4;
+  }
+
+  static unsigned int out_width() // columns of C produced per kernel block
+  {
+    return sme::get_vector_length<float>() * 1;
+  }
+
+  static constexpr unsigned int k_unroll() // K elements consumed per accumulation step (no unrolling)
+  {
+    return 1;
+  }
+
+  static constexpr bool supports_accumulate()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_bias()
+  {
+    return true;
+  }
+
+  static constexpr bool supports_activation()
+  {
+    return true;
+  }
+
+  static constexpr bool is_sme()
+  {
+    return true;
+  }
+
+  // Default to the generic (assembly) kernel implementation declared above.
+  kern_type kernel = sme2_interleaved_nomerge_fp32_mopa_4VLx1VL;
+
+  StdTransformsSME<operand_type, result_type, 4, 1, 1> transforms = {}; // interleave transforms matching the 4VL x 1VL blocking, k_unroll = 1
+
+  cls_sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const CPUInfo *ci)
+  {
+    ARM_COMPUTE_UNUSED(ci); // no CPU-specific kernel selection is performed
+  }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..5252e8140b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_fp32_mopa_4VLx1VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const float *const A,
+ const float *const B,
+ float *const C, const int ldc,
+ const int M, const int N, const int K,
+ const float *const bias,
+ const Activation act,
+ bool accumulate,
+ float *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(K * sizeof(float)),
+ C(C), ldcb(ldc * sizeof(float)),
+ M(M), N(N), K(K),
+ n_loops((K - 1) / 2), n_tail_iters((K - 1) % 2),
+ min(-std::numeric_limits<float>::infinity()),
+ max(std::numeric_limits<float>::infinity()),
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (act.type == Activation::Type::None)
+ {
+ flags |= 1 << 2; // SKIP_ACTIVATION
+ }
+
+ // Initialise the activation values
+ switch (act.type)
+ {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ this->max = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ this->min = static_cast<float>(0);
+ break;
+ }
+ }
+
+ const float *const A;
+ const float *const B;
+ const long kstride_bytes;
+ float *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ float min = -std::numeric_limits<float>::infinity();
+ float max = std::numeric_limits<float>::infinity();
+
+ const float *const bias;
+
+ float *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, act, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "fmov z8.s, #1.0\n"
+ "ldnt1w { z27.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0x809b2500 // fmopa za0.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2501 // fmopa za1.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2502 // fmopa za2.s, p1/M, p1/M, z8.s, z27.s\n"
+ ".inst 0x809b2503 // fmopa za3.s, p1/M, p1/M, z8.s, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "lsr x22, x19, #0x2\n"
+ "and x21, x19, #0x3\n"
+ "ldr x20, [%x[args], %[offsetof_B]]\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x20, x9, x19, x20\n" // bptr = B + n * kstride_bytes
+ "cbz x22, 8f\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa040c344 // ld1w { z4.s-z7.s }, pn8.b/Z, [x26]\n"
+ "ldnt1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0xa041c34c // ld1w { z12.s-z15.s }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1w { z23.s }, p1/Z, [x20, #1, MUL VL]\n"
+ ".inst 0xa042c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1w { z21.s }, p1/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xa143c352 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "addvl x20, x20, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
+ ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
+ ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+ ".inst 0xa040c344 // ld1w { z4.s-z7.s }, pn8.b/Z, [x26]\n"
+ ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
+ "ldnt1w { z29.s }, p1/Z, [x20]\n"
+ ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
+ ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
+ ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
+ ".inst 0xa041c34c // ld1w { z12.s-z15.s }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
+ "ldnt1w { z23.s }, p1/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
+ ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
+ ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
+ ".inst 0xa042c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1w { z21.s }, p1/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
+ ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
+ ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
+ ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
+ ".inst 0xa143c352 // ld1w { z18.s, z22.s, z26.s, z30.s }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "addvl x20, x20, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+ ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
+ ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
+ ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+ ".inst 0x80972580 // fmopa za0.s, p1/M, p1/M, z12.s, z23.s\n"
+ ".inst 0x809725a1 // fmopa za1.s, p1/M, p1/M, z13.s, z23.s\n"
+ ".inst 0x809725c2 // fmopa za2.s, p1/M, p1/M, z14.s, z23.s\n"
+ ".inst 0x809725e3 // fmopa za3.s, p1/M, p1/M, z15.s, z23.s\n"
+ ".inst 0x80952400 // fmopa za0.s, p1/M, p1/M, z0.s, z21.s\n"
+ ".inst 0x80952421 // fmopa za1.s, p1/M, p1/M, z1.s, z21.s\n"
+ ".inst 0x80952442 // fmopa za2.s, p1/M, p1/M, z2.s, z21.s\n"
+ ".inst 0x80952463 // fmopa za3.s, p1/M, p1/M, z3.s, z21.s\n"
+ ".inst 0x809b2640 // fmopa za0.s, p1/M, p1/M, z18.s, z27.s\n"
+ ".inst 0x809b26c1 // fmopa za1.s, p1/M, p1/M, z22.s, z27.s\n"
+ ".inst 0x809b2742 // fmopa za2.s, p1/M, p1/M, z26.s, z27.s\n"
+ ".inst 0x809b27c3 // fmopa za3.s, p1/M, p1/M, z30.s, z27.s\n"
+ "8:" // K oddments
+ "cbz x21, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040c344 // ld1w { z4.s-z7.s }, pn8.b/Z, [x26]\n"
+ "subs x21, x21, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "addvl x20, x20, #1\n"
+ ".inst 0x809d2480 // fmopa za0.s, p1/M, p1/M, z4.s, z29.s\n"
+ ".inst 0x809d24a1 // fmopa za1.s, p1/M, p1/M, z5.s, z29.s\n"
+ ".inst 0x809d24c2 // fmopa za2.s, p1/M, p1/M, z6.s, z29.s\n"
+ ".inst 0x809d24e3 // fmopa za3.s, p1/M, p1/M, z7.s, z29.s\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
+ ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840502 // mova za2h.s[x12], { z8.s-z11.s }\n"
+ ".inst 0xa043c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a4 // st1w { z4.s-z7.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 42f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1ac // st1w { z12.s-z15.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 42f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "sub x24, x11, x10\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "tbz x15, #2, 27f\n"
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Skip activation: Accumulator row 0 loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Skip activation: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 17f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "17:" // Store to output array: Skip activation: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Skip activation: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Skip activation: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 20f\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "20:" // Store to output array: Skip activation: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 22f\n"
+ "21:" // Store to output array: Skip activation: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Skip activation: Accumulator row 2 oddments
+ "cbz x19, 23f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 23f\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "23:" // Store to output array: Skip activation: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 25f\n"
+ "24:" // Store to output array: Skip activation: Accumulator row 3 loop
+ ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
+ "st1w { z4.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z5.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z6.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z7.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Skip activation: Accumulator row 3 oddments
+ "cbz x19, 26f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 26f\n"
+ "st1w { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "26:" // Store to output array: Skip activation: Accumulator row 3 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "b 40f\n"
+ "27:" // Store to output array: Skip activation: End
+ "cntw x22\n"
+ "cmp x24, x22\n"
+ "ld1rw { z25.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_min]]\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "ld1rw { z24.s }, p1/Z, [%x[args], %[offsetof_KernelArgs_max]]\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 29f\n"
+ "28:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 28b\n"
+ "29:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 30f\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb28 // fclamp { z8.s-z11.s }, z25.s, z24.s\n"
+ "st1w { z8.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z9.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 30f\n"
+ "st1w { z10.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "30:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 32f\n"
+ "31:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 31b\n"
+ "32:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 33f\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 33f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "33:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 35f\n"
+ "34:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 34b\n"
+ "35:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 36f\n"
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 36f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "36:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 40f\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 38f\n"
+ "37:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1b8cb34 // fclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ "st1w { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1w { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 37b\n"
+ "38:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 39f\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1b8cb30 // fclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 39f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "39:" // Store to output array: Accumulator row 3 oddments: End
+ "40:" // Store to output array: End
+ "tbz x15, #0, 42f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "41:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 41b\n"
+ "42:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_KernelArgs_max] "I" (offsetof(KernelArgs, max)), [offsetof_KernelArgs_min] "I" (offsetof(KernelArgs, min)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..b8bcd53c21
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..62170c4945
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+// SME2 kernel: computes a 1VLx4VL int32 tile of A(s8) x B(s8) via SMOPA
+// (signed outer-product-and-accumulate into ZA), adds bias and the
+// row/column sums embedded in the interleaved operands (ADDHA/ADDVA),
+// then requantizes (sqdmulh + srshl + c_offset + sclamp) and stores s8
+// results to C.  Depending on `flags`, the int32 tile is instead loaded
+// from / stored to `accumulator_buffer` for partially-computed blocks.
+void sme2_interleaved_nomerge_s8q_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A; // interleaved LHS operand
+ const int8_t *const B; // interleaved RHS operand
+ // Byte stride between column blocks of B; K is padded to a multiple
+ // of 4 to match the SMOPA 4-byte inner-product grouping.
+ const long kstride_bytes;
+ int8_t *const C; // output (may be nullptr -> spill to buffer)
+ const long ldcb; // row stride of C in bytes
+ // NOTE(review): n_loops/n_tail_iters are initialized here but are not
+ // in the asm operand list below; the asm recomputes its own loop
+ // counts from K.  Kept for layout consistency with sibling kernels.
+ const long M, N, K, n_loops, n_tail_iters;
+ // NOTE(review): min/max are likewise not referenced by the asm, which
+ // clamps using rq.minval/rq.maxval directly.
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias; // per-column int32 bias, may be null
+ const int n_0; // global column offset for per-channel params
+
+ int32_t *const accumulator_buffer; // int32 spill area for partial tiles
+ uint64_t flags; // bit0: fill from buffer, bit1: store to buffer, bit2: per-channel
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ // Flow (labels): 1 = refill ZA from buffer; 3 = M/N tile loop;
+ // 5-7 = K loop (4x-unrolled SMOPA) plus 9 = K oddments; then either
+ // 11/13 = spill int32 tile to buffer, or 14-18 = requantize + store s8
+ // rows to C; 20 = refill accumulators; 21 = advance to next tile.
+ __asm__ __volatile__(
+ "ldr x13, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x13, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w9, [%x[args], %[offsetof_M]]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldr w26, [%x[args], %[offsetof_N]]\n"
+ "ldr x25, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x24, x25\n"
+ ".inst 0x25ba6770 // whilelt pn8.s, x27, x26, VLx4\n"
+ "tbnz x13, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa01bc279 // ldnt1w { z24.s-z27.s }, p8/Z, [x19, x27, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n"
+ ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x27\n"
+ "mov x20, x28\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x26\n"
+ "csel x20, x28, x20, LT\n"
+ "mov x19, x13\n"
+ "bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
+ "cmp x20, x9\n"
+ "csel x13, x19, x13, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x27, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa08f2603 // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ ".inst 0xa08c2600 // smopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa08d2601 // smopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa08e2602 // smopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa08f2603 // smopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ ".inst 0xa09826a0 // smopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa09926a1 // smopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa09a26a2 // smopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa09b26a3 // smopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ ".inst 0xa0802660 // smopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa0812661 // smopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa0822662 // smopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa0832663 // smopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xa04086dc // ld1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0xa09c2540 // smopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa09d2541 // smopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa09e2542 // smopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa09f2543 // smopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "ld1w { z14.s }, p1/Z, [x24]\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n"
+ "tbz x13, #1, 14f\n"
+ "tbz x13, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xa061c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa062c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 11b\n"
+ "b 21f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa061c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 13b\n"
+ "b 21f\n"
+ "14:" // Store to output array
+ "ldr x23, [%x[args], %[offsetof_C]]\n"
+ "add x23, x23, x27\n" // C += n
+ "sub x22, x9, x28\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x21, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x23, x28, x21, x23\n" // C += m * ldc
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x13, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x27\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c26c // ld1w { z12.s-z15.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c264 // ld1w { z4.s-z7.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x19\n"
+ "whilelt p0.b, x27, x26\n"
+ "cmp x22, x19\n"
+ "csel x19, x22, x19, LT\n"
+ "lsr x20, x19, #0x1\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x1\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x20, LSL #1\n"
+ ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
+ ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
+ ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
+ ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
+ ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
+ ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
+ ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
+ ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
+ ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
+ ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
+ ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
+ ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z23.b, z2.b, z24.b\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z16.b, z10.b\n"
+ "uzp1 z16.b, z23.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "19:" // Store to output array: End
+ "tbz x13, #0, 21f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "20:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 20b\n"
+ "21:" // End block
+ "incw x27, ALL, MUL #4\n"
+ "cmp x27, x26\n"
+ "blt 3b\n"
+ "incw x28\n"
+ "cmp x28, x9\n"
+ "mov x27, #0x0\n"
+ "mov x25, x24\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..954b0da0e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+// Wrapper/descriptor for the SME2 quantized s8 GEMM kernel with a 2VLx2VL
+// output tile: per call it produces 2 vector-lengths of rows by 2
+// vector-lengths of columns (in int32 accumulator lanes).  "nomerge"
+// kernels write their own output (or spill int32 partials to an
+// accumulator buffer) instead of using a separate merge step.
+class cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL
+{
+public:
+ typedef int8_t operand_type; // element type of the A/B inputs
+ typedef int8_t result_type; // element type of the requantized output C
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ // Rows of C produced per tile: two SVE vectors of int32 elements.
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ // Columns of C produced per tile: two SVE vectors of int32 elements.
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ // K dimension is processed in groups of 4 int8 values (matches the
+ // roundup(K, 4) operand blocking used by the kernel).
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ // Accumulation into existing C is not supported; partial results go
+ // through the int32 accumulator buffer instead.
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ // Activation is handled by the requantize clamp (minval/maxval), not here.
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_2VLx2VL;
+
+ // Transforms for a 2x2 tile arrangement with k-blocking of 4;
+ // final template argument selects the quantized (integer) variant.
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..e565699af5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8q_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa0094275 // ldnt1w { z20.s-z21.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n"
+ ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa08825c0 // smopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa08925c1 // smopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa08825e2 // smopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa08925e3 // smopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa0942400 // smopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa09c2401 // smopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa0942422 // smopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa09c2423 // smopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa0822480 // smopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa08a2481 // smopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa08224a2 // smopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa08a24a3 // smopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa04006d0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0xa09027c0 // smopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa09127c1 // smopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa09027e2 // smopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa09127e3 // smopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040474e // ld1w { z14.s-z15.s }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404262 // ld1w { z2.s-z3.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404260 // ld1w { z0.s-z1.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z12.h, z28.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z13.h, z29.h\n"
+ "uzp1 z17.h, z14.h, z30.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z15.h, z31.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z28.h, z12.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z29.h, z13.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "uzp1 z16.h, z30.h, z14.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 22f\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
+ "uzp1 z17.h, z6.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z7.h, z19.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z20.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z21.h, z17.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "uzp1 z16.h, z22.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "22:" // Store to output array: End
+ "tbz x15, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..420c219af5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8q_mopa_4VLx1VL;
+
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..a738a10418
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8q_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8q_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<int8_t>::min();
+ int32_t max = std::numeric_limits<int8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "ldnt1w { z15.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa0892660 // smopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ ".inst 0xa08926e1 // smopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa0892762 // smopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa08927e3 // smopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa0952600 // smopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ ".inst 0xa0952681 // smopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa0952702 // smopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa0952783 // smopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa08c2440 // smopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c24c1 // smopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa08c2542 // smopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa08c25c3 // smopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1b { z0.b }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0xa0802640 // smopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa08026c1 // smopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa0802742 // smopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa08027c3 // smopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n"
+ ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n"
+ ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1bc // st1w { z28.s-z31.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1b4 // st1w { z20.s-z23.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z8.s }, p0/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z7.s }, p0/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z15.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
+ "st1b { z28.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z29.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "st1b { z30.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
+ "st1b { z24.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z25.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z26.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z27.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
+ "st1b { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
+ "st1b { z0.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z1.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "st1b { z2.s }, p0, [x25]\n"
+ "27:" // Store to output array: Accumulator row 3 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..c969c7aaff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL // Descriptor for the SME2 s8->s32 MOPA kernel with a 1VL x 4VL output tile.
+{
+public:
+ typedef int8_t operand_type; // Element type of the packed A/B operands.
+ typedef int32_t result_type; // Element type of the C output / ZA accumulators.
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); // Common signature for this kernel family.
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height() // Output rows per tile: 1 x VL(int32) — the "1VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 1;
+ }
+
+ static unsigned int out_width() // Output columns per tile: 4 x VL(int32) — the "4VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll() // K values consumed per packed group (int8 inputs, 4 per int32 lane).
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate() // Kernel can resume from / spill to a partial accumulator buffer.
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias() // Bias is folded in via ADDHA at accumulator setup.
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation() // Activation is ignored by the generic kernel (ARM_COMPUTE_UNUSED(act)).
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme() // Requires SME; caller must gate on CPU feature detection.
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4> transforms = {}; // Pack/interleave transforms matching the 1x4 tile shape and k_unroll of 4.
+
+ cls_sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const CPUInfo *ci) // CPUInfo unused: only one implementation exists for this class.
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..7ddd7c2e09
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8s32_mopa_1VLx4VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer) // SME2 MOPA GEMM: int8 operands -> int32 results, 1VLx4VL tile, "nomerge" (partials staged via ZA / accumulator buffer; C may be nullptr).
+{
+ ARM_COMPUTE_UNUSED(act); // Activation unsupported here (see supports_activation() == false in the class header).
+
+ struct KernelArgs // Flat argument block read by the inline asm via offsetof().
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int32_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)), // B panel stride: K rounded up to k_unroll (4) int8s.
+ C(C), ldcb(ldc * sizeof(int32_t)), // Row stride of C in bytes.
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2), // NOTE(review): appears unused — the asm below recomputes its K-loop counts from K directly (label 5); presumably kept by the generator. TODO confirm.
+
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int32_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+
+ const int32_t *const bias;
+
+ int32_t *const accumulator_buffer; // Spill/fill area for ZA when flags bit0/bit1 are set.
+ uint64_t flags; // bit0: fill ZA from buffer on entry; bit1: store ZA to buffer instead of C.
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x11, [%x[args], %[offsetof_flags]]\n" // x11 = flags for the current block.
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p0.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n" // x10 = accumulator-buffer read pointer.
+ "ldr x9, [%x[args], %[offsetof_accumulator_buffer]]\n" // x9 = accumulator-buffer write pointer.
+ "tbz x11, #0, 2f\n" // Skip initial ZA fill unless flags bit0 set.
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c544 // ld1w { z4.s-z7.s }, pn9.b/Z, [x10]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c55c // ld1w { z28.s-z31.s }, pn9.b/Z, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c540 // ld1w { z0.s-z3.s }, pn9.b/Z, [x10, #0xc, MUL VL]\n"
+ ".inst 0xc0840403 // mova za3h.s[x12], { z0.s-z3.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x10, x10, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w28, [%x[args], %[offsetof_M]]\n" // w28 = M; x27 = current row block, x26 = current column.
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "ldr w25, [%x[args], %[offsetof_N]]\n"
+ "ldr x24, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x23, x24\n"
+ ".inst 0x25b96750 // whilelt pn8.s, x26, x25, VLx4\n"
+ "tbnz x11, #0, 4f\n" // If resuming from buffer, don't zero/bias the accumulators.
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n" // No bias pointer -> leave accumulators zeroed.
+ ".inst 0xa11ac26a // ldnt1w { z2.s, z6.s, z10.s, z14.s }, p8/Z, [x19, x26, LSL #2]\n"
+ ".inst 0xc0900040 // addha za0.s, p0/M, p0/M, z2.s\n"
+ ".inst 0xc09000c1 // addha za1.s, p0/M, p0/M, z6.s\n"
+ ".inst 0xc0900142 // addha za2.s, p0/M, p0/M, z10.s\n"
+ ".inst 0xc09001c3 // addha za3.s, p0/M, p0/M, z14.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x26\n"
+ "mov x20, x27\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x25\n"
+ "csel x20, x27, x20, LT\n"
+ "mov x19, x11\n"
+ "bfm x11, XZR, #0x0, #0x0 // bfc x11, #0x0, #0x1\n"
+ "cmp x20, x28\n"
+ "csel x11, x19, x11, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n" // Recompute K iteration counts: x21 = K/4 quads, x20 = remainder.
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x26, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ ".inst 0xa14086c9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x22]\n"
+ "ld1b { z10.b }, p0/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa14186da // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1b { z16.b }, p0/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa14286cb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1b { z25.b }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa14386c8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
+ ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
+ ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
+ ".inst 0xa14086c9 // ldnt1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
+ ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
+ ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
+ "ld1b { z10.b }, p0/Z, [x23, #1, MUL VL]\n"
+ ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
+ ".inst 0xa14186da // ldnt1b { z18.b, z22.b, z26.b, z30.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
+ ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
+ ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
+ "ld1b { z16.b }, p0/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xa14286cb // ldnt1b { z3.b, z7.b, z11.b, z15.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
+ ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
+ ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
+ ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
+ "ld1b { z25.b }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ ".inst 0xa14386c8 // ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+ ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
+ ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
+ ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+ ".inst 0xa0920140 // smopa za0.s, p0/M, p0/M, z10.b, z18.b\n"
+ ".inst 0xa0960141 // smopa za1.s, p0/M, p0/M, z10.b, z22.b\n"
+ ".inst 0xa09a0142 // smopa za2.s, p0/M, p0/M, z10.b, z26.b\n"
+ ".inst 0xa09e0143 // smopa za3.s, p0/M, p0/M, z10.b, z30.b\n"
+ ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
+ ".inst 0xa0870201 // smopa za1.s, p0/M, p0/M, z16.b, z7.b\n"
+ ".inst 0xa08b0202 // smopa za2.s, p0/M, p0/M, z16.b, z11.b\n"
+ ".inst 0xa08f0203 // smopa za3.s, p0/M, p0/M, z16.b, z15.b\n"
+ ".inst 0xa0800320 // smopa za0.s, p0/M, p0/M, z25.b, z0.b\n"
+ ".inst 0xa0840321 // smopa za1.s, p0/M, p0/M, z25.b, z4.b\n"
+ ".inst 0xa0880322 // smopa za2.s, p0/M, p0/M, z25.b, z8.b\n"
+ ".inst 0xa08c0323 // smopa za3.s, p0/M, p0/M, z25.b, z12.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z20.b }, p0/Z, [x23]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x23, x23, #1\n"
+ ".inst 0xa14086c1 // ld1b { z1.b, z5.b, z9.b, z13.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0xa0810280 // smopa za0.s, p0/M, p0/M, z20.b, z1.b\n"
+ ".inst 0xa0850281 // smopa za1.s, p0/M, p0/M, z20.b, z5.b\n"
+ ".inst 0xa0890282 // smopa za2.s, p0/M, p0/M, z20.b, z9.b\n"
+ ".inst 0xa08d0283 // smopa za3.s, p0/M, p0/M, z20.b, z13.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x11, #1, 14f\n" // flags bit1 clear -> write results to C; set -> spill ZA to buffer.
+ "tbz x11, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c55c // ld1w { z28.s-z31.s }, pn9.b/Z, [x10]\n"
+ ".inst 0xc0860404 // mova { z4.s-z7.s }, za0h.s[x12]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c540 // ld1w { z0.s-z3.s }, pn9.b/Z, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840401 // mova za1h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c54c // ld1w { z12.s-z15.s }, pn9.b/Z, [x10, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c524 // st1w { z4.s-z7.s }, pn9.b, [x9]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0xa061c534 // st1w { z20.s-z23.s }, pn9.b, [x9, #0x4, MUL VL]\n"
+ ".inst 0xa062c538 // st1w { z24.s-z27.s }, pn9.b, [x9, #0x8, MUL VL]\n"
+ ".inst 0xa063c53c // st1w { z28.s-z31.s }, pn9.b, [x9, #0xc, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "blt 11b\n"
+ "b 20f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860414 // mova { z20.s-z23.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c534 // st1w { z20.s-z23.s }, pn9.b, [x9]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c520 // st1w { z0.s-z3.s }, pn9.b, [x9, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c528 // st1w { z8.s-z11.s }, pn9.b, [x9, #0x8, MUL VL]\n"
+ ".inst 0xa063c52c // st1w { z12.s-z15.s }, pn9.b, [x9, #0xc, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "blt 13b\n"
+ "b 20f\n"
+ "14:" // Store to output array
+ "ldr x22, [%x[args], %[offsetof_C]]\n"
+ "sub x20, x28, x27\n"
+ "cntw x19\n"
+ "ldr x21, [%x[args], %[offsetof_ldcb]]\n"
+ "cmp x20, x19\n"
+ "csel x19, x20, x19, LT\n"
+ "add x22, x22, x26, LSL #2\n" // C += n
+ "lsr x20, x19, #0x2\n"
+ "madd x22, x27, x21, x22\n" // C += m * ldc
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2c0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ ".inst 0xa160c2c1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xa160c2c2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xa160c2c3 // st1w { z3.s, z7.s, z11.s, z15.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa160c2c0 // st1w { z0.s, z4.s, z8.s, z12.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xa160c2c1 // st1w { z1.s, z5.s, z9.s, z13.s }, p8, [x22]\n"
+ "add x22, x22, x21\n"
+ "beq 17f\n"
+ ".inst 0xa160c2c2 // st1w { z2.s, z6.s, z10.s, z14.s }, p8, [x22]\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "18:" // Store to output array: End
+ "tbz x11, #0, 20f\n" // If the next block continues this tile, refill ZA from the buffer.
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "19:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c54c // ld1w { z12.s-z15.s }, pn9.b/Z, [x10]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c550 // ld1w { z16.s-z19.s }, pn9.b/Z, [x10, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c544 // ld1w { z4.s-z7.s }, pn9.b/Z, [x10, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x10, x10, #16\n"
+ "blt 19b\n"
+ "20:" // End block
+ "incw x26, ALL, MUL #4\n" // Advance column by one 4VL tile; loop N first, then M.
+ "cmp x26, x25\n"
+ "blt 3b\n"
+ "incw x27\n"
+ "cmp x27, x28\n"
+ "mov x26, #0x0\n"
+ "mov x24, x23\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..a0705e50cd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL // Descriptor for the SME2 s8->s32 MOPA kernel with a square 2VL x 2VL output tile.
+{
+public:
+ typedef int8_t operand_type; // Element type of the packed A/B operands.
+ typedef int32_t result_type; // Element type of the C output / ZA accumulators.
+
+ typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer); // Common signature for this kernel family.
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height() // Output rows per tile: 2 x VL(int32) — the first "2VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static unsigned int out_width() // Output columns per tile: 2 x VL(int32) — the second "2VL" in the kernel name.
+ {
+ return sme::get_vector_length<int32_t>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll() // K values consumed per packed group (int8 inputs, 4 per int32 lane).
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate() // Kernel can resume from / spill to a partial accumulator buffer.
+ {
+ return true;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation() // Activation is ignored by the generic kernel.
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme() // Requires SME; caller must gate on CPU feature detection.
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4> transforms = {}; // Pack/interleave transforms matching the 2x2 tile shape and k_unroll of 4.
+
+ cls_sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const CPUInfo *ci) // CPUInfo unused: only one implementation exists for this class.
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..9ae18f0e6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
// SME2 GEMM kernel: accumulates C (int32) += A (int8, interleaved panels) *
// B (int8, interleaved panels) using widening sum-of-outer-products (SMOPA)
// instructions into a 2VL x 2VL tile of ZA accumulators per M/N iteration.
//
// A, B       : interleaved operand panels; K is rounded up to a multiple of 4.
// C, ldc     : output array and leading dimension (in elements). When C is
//              nullptr the accumulators are spilled to accumulator_buffer
//              instead of being written to an output array.
// bias       : optional per-column int32 bias (may be nullptr); added into the
//              accumulators once per tile via ADDHA.
// act        : unused - this kernel does not fuse activation.
// accumulate : when true, ZA is preloaded from accumulator_buffer rather than
//              zeroed, so partial results carry across successive calls.
void sme2_interleaved_nomerge_s8s32_mopa_2VLx2VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
{
    ARM_COMPUTE_UNUSED(act);

    // Argument block read by the assembly via offsetof()-based loads.
    struct KernelArgs
    {
        KernelArgs(
            const int8_t *const A,
            const int8_t *const B,
            int32_t *const C, const int ldc,
            const int M, const int N, const int K,
            const int32_t *const bias,

            bool accumulate,
            int32_t *const accumulator_buffer
        ) : A(A),
            B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
            C(C), ldcb(ldc * sizeof(int32_t)),
            M(M), N(N), K(K),
            // NOTE(review): n_loops/n_tail_iters are initialised here but are
            // not referenced by this kernel's assembly, which recomputes its
            // loop counts directly from K (see label 5 below).
            n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),

            bias(bias),
            accumulator_buffer(accumulator_buffer),
            flags(0x0)
        {
            if (accumulate)
            {
                flags |= 1 << 0;  // FILL_ACCUMULATORS_FROM_BUFFER
            }
            if (C == nullptr)
            {
                flags |= 1 << 1;  // STORE_ACCUMULATORS_TO_BUFFER
            }
        }

        const int8_t *const A;
        const int8_t *const B;
        const long kstride_bytes;   // byte stride between B panels (K rounded to 4)
        int32_t *const C;
        const long ldcb;            // C leading dimension in bytes
        const long M, N, K, n_loops, n_tail_iters;

        const int32_t *const bias;

        int32_t *const accumulator_buffer;
        uint64_t flags;             // bit 0: fill ZA from buffer; bit 1: store ZA to buffer
    };

    // Construct arguments for this kernel
    KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);

    __asm__ __volatile__(
        // x15 = flags; x14/x13 = accumulator-buffer read/write cursors.
        "ldr x15, [%x[args], %[offsetof_flags]]\n"
        ".inst 0xd503477f // SMSTART ZA\n"
        "ptrue p0.b\n"
        ".inst 0x25207811 // ptrue pn9.b\n"
        "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
        "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
        // Flags bit 0 set: refill ZA from the accumulator buffer first.
        "tbz x15, #0, 2f\n"
        "mov x12, #0x0\n"
        "cntw x19\n"
        "1:" // Initial accumulator load from buffer: Loop
        ".inst 0xa040c5c8 // ld1w { z8.s-z11.s }, pn9.b/Z, [x14]\n"
        ".inst 0xc0840500 // mova za0h.s[x12], { z8.s-z11.s }\n"
        ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
        ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
        ".inst 0xa042c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
        ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
        ".inst 0xa043c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
        ".inst 0xc0840603 // mova za3h.s[x12], { z16.s-z19.s }\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        "addvl x14, x14, #16\n"
        "blt 1b\n"
        "2:" // Initial accumulator load from buffer: End
        // Outer tile loop state: x11 = M, x28 = N, x10 = row pos, x9 = col pos,
        // x27 walks the A panels.
        "ldr w11, [%x[args], %[offsetof_M]]\n"
        "mov x10, #0x0\n"
        "mov x9, #0x0\n"
        "ldr w28, [%x[args], %[offsetof_N]]\n"
        "ldr x27, [%x[args], %[offsetof_A]]\n"
        "3:" // M and N loop
        "mov x26, x27\n"
        ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
        "tbnz x15, #0, 4f\n"
        // Fresh tile: zero ZA, then add the bias row (if any) into each
        // accumulator tile via ADDHA.
        "ldr x19, [%x[args], %[offsetof_bias]]\n"
        ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
        "cbz x19, 5f\n"
        ".inst 0xa109427c // ldnt1w { z20.s, z28.s }, p8/Z, [x19, x9, LSL #2]\n"
        ".inst 0xc0900280 // addha za0.s, p0/M, p0/M, z20.s\n"
        ".inst 0xc0900381 // addha za1.s, p0/M, p0/M, z28.s\n"
        ".inst 0xc0900282 // addha za2.s, p0/M, p0/M, z20.s\n"
        ".inst 0xc0900383 // addha za3.s, p0/M, p0/M, z28.s\n"
        "4:" // Prepare accumulators: Test for last block
        // Clear the fill flag except when this is the final (bottom-right)
        // block, so refilling only happens once per pass.
        "mov x19, x9\n"
        "mov x20, x10\n"
        "incw x19, ALL, MUL #2\n"
        "incw x20, ALL, MUL #2\n"
        "cmp x19, x28\n"
        "csel x20, x10, x20, LT\n"
        "mov x19, x15\n"
        "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
        "cmp x20, x11\n"
        "csel x15, x19, x15, LT\n"
        "5:" // Prepare accumulators: End
        // Derive loop counts from K: x21 = unrolled (x4) iterations,
        // x20 = leftover K groups.
        "ldr x19, [%x[args], %[offsetof_K]]\n"
        "add x19, x19, #0x3\n"
        "lsr x19, x19, #0x2\n"
        "ldr x22, [%x[args], %[offsetof_B]]\n"
        "lsr x21, x19, #0x2\n"
        "and x20, x19, #0x3\n"
        "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
        "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
        "cbz x21, 8f\n"
        // Software-pipelined preload of the first four A/B vector pairs.
        "subs x21, x21, #0x1\n"
        ".inst 0xa1400756 // ld1b { z22.b, z30.b }, pn9.b/Z, [x26]\n"
        ".inst 0xa14006d9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x22]\n"
        ".inst 0xa1410750 // ld1b { z16.b, z24.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
        ".inst 0xa14106cb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
        ".inst 0xa0420748 // ld1b { z8.b-z9.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
        ".inst 0xa04206d3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
        ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
        "addvl x26, x26, #8\n"
        ".inst 0xa14306dd // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
        "addvl x22, x22, #8\n"
        "ble 7f\n"
        "6:" // K loop
        // Four groups of 2x2 SMOPA outer products per iteration, with the next
        // iteration's loads interleaved between them.
        ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
        "subs x21, x21, #0x1\n"
        ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
        ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
        ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
        ".inst 0xa1400756 // ld1b { z22.b, z30.b }, pn9.b/Z, [x26]\n"
        ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
        ".inst 0xa14006d9 // ldnt1b { z17.b, z25.b }, pn9.b/Z, [x22]\n"
        ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
        ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
        ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
        ".inst 0xa1410750 // ld1b { z16.b, z24.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
        ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
        ".inst 0xa14106cb // ldnt1b { z3.b, z11.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
        ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
        ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
        ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
        ".inst 0xa0420748 // ld1b { z8.b-z9.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
        ".inst 0xa04206d3 // ldnt1b { z18.b-z19.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
        ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
        ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
        ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
        ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
        ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
        "addvl x26, x26, #8\n"
        ".inst 0xa14306dd // ldnt1b { z21.b, z29.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
        "addvl x22, x22, #8\n"
        "bgt 6b\n"
        "7:" // K loop tail
        // Drain the final preloaded group (no further loads).
        ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
        ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
        ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
        ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
        ".inst 0xa0830200 // smopa za0.s, p0/M, p0/M, z16.b, z3.b\n"
        ".inst 0xa08b0201 // smopa za1.s, p0/M, p0/M, z16.b, z11.b\n"
        ".inst 0xa0830302 // smopa za2.s, p0/M, p0/M, z24.b, z3.b\n"
        ".inst 0xa08b0303 // smopa za3.s, p0/M, p0/M, z24.b, z11.b\n"
        ".inst 0xa0920100 // smopa za0.s, p0/M, p0/M, z8.b, z18.b\n"
        ".inst 0xa0930101 // smopa za1.s, p0/M, p0/M, z8.b, z19.b\n"
        ".inst 0xa0920122 // smopa za2.s, p0/M, p0/M, z9.b, z18.b\n"
        ".inst 0xa0930123 // smopa za3.s, p0/M, p0/M, z9.b, z19.b\n"
        ".inst 0xa0950080 // smopa za0.s, p0/M, p0/M, z4.b, z21.b\n"
        ".inst 0xa09d0081 // smopa za1.s, p0/M, p0/M, z4.b, z29.b\n"
        ".inst 0xa09500a2 // smopa za2.s, p0/M, p0/M, z5.b, z21.b\n"
        ".inst 0xa09d00a3 // smopa za3.s, p0/M, p0/M, z5.b, z29.b\n"
        "8:" // K oddments
        "cbz x20, 10f\n"
        "9:" // K oddments: Loop
        ".inst 0xa1400756 // ld1b { z22.b, z30.b }, pn9.b/Z, [x26]\n"
        "subs x20, x20, #0x1\n"
        "addvl x26, x26, #2\n"
        ".inst 0xa14006d1 // ld1b { z17.b, z25.b }, pn9.b/Z, [x22]\n"
        "addvl x22, x22, #2\n"
        ".inst 0xa09102c0 // smopa za0.s, p0/M, p0/M, z22.b, z17.b\n"
        ".inst 0xa09902c1 // smopa za1.s, p0/M, p0/M, z22.b, z25.b\n"
        ".inst 0xa09103c2 // smopa za2.s, p0/M, p0/M, z30.b, z17.b\n"
        ".inst 0xa09903c3 // smopa za3.s, p0/M, p0/M, z30.b, z25.b\n"
        "bgt 9b\n"
        "10:" // K oddments: End
        // Flags bit 1 set: spill ZA to the partial-result buffer (optionally
        // refilling it at the same time when bit 0 is also set); otherwise
        // store the tile to the output array C.
        "tbz x15, #1, 14f\n"
        "tbz x15, #0, 12f\n"
        "mov x12, #0x0\n"
        "cntw x19\n"
        "11:" // Store to partial result buffer: Store and refill: Loop
        ".inst 0xa040c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14]\n"
        ".inst 0xc0860418 // mova { z24.s-z27.s }, za0h.s[x12]\n"
        ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
        ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
        ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
        ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
        ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
        ".inst 0xc0860468 // mova { z8.s-z11.s }, za3h.s[x12]\n"
        ".inst 0xa042c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
        ".inst 0xc0840582 // mova za2h.s[x12], { z12.s-z15.s }\n"
        ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
        ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        ".inst 0xa060c5b8 // st1w { z24.s-z27.s }, pn9.b, [x13]\n"
        "addvl x14, x14, #16\n"
        ".inst 0xa061c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13, #0x4, MUL VL]\n"
        ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
        ".inst 0xa063c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0xc, MUL VL]\n"
        "addvl x13, x13, #16\n"
        "blt 11b\n"
        "b 23f\n"
        "12:" // Store to partial result buffer: Store only
        "mov x12, #0x0\n"
        "cntw x19\n"
        "13:" // Store to partial result buffer: Store only: Loop
        ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
        ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
        ".inst 0xa060c5a0 // st1w { z0.s-z3.s }, pn9.b, [x13]\n"
        ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
        ".inst 0xc0860464 // mova { z4.s-z7.s }, za3h.s[x12]\n"
        ".inst 0xa061c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0x4, MUL VL]\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
        ".inst 0xa063c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0xc, MUL VL]\n"
        "addvl x13, x13, #16\n"
        "blt 13b\n"
        "b 23f\n"
        "14:" // Store to output array
        // x25 = &C[m * ldc + n]; x24 = rows remaining, x21 = rows this pass
        // (clamped to one vector length), split into x20 groups of 4 plus
        // x19 odd rows.
        "ldr x25, [%x[args], %[offsetof_C]]\n"
        "sub x24, x11, x10\n"
        "cntw x23\n"
        "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
        "cmp x24, x23\n"
        "csel x21, x24, x23, LT\n"
        "add x25, x25, x9, LSL #2\n" // C += n
        "lsr x20, x21, #0x2\n"
        "madd x25, x10, x22, x25\n" // C += m * ldc
        "mov x12, #0x0\n"
        "and x19, x21, #0x3\n"
        "cbz x20, 16f\n"
        "15:" // Store to output array: Accumulator row 0 loop
        ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
        ".inst 0xc0860438 // mova { z24.s-z27.s }, za1h.s[x12]\n"
        ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "add x12, x12, #0x4\n"
        ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "cmp x12, x20, LSL #2\n"
        ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "blt 15b\n"
        "16:" // Store to output array: Accumulator row 0 oddments
        "cbz x19, 17f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
        ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
        ".inst 0xa1604320 // st1w { z0.s, z8.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 17f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xa1604321 // st1w { z1.s, z9.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 17f\n"
        ".inst 0xa1604322 // st1w { z2.s, z10.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "17:" // Store to output array: Accumulator row 0 oddments: End
        "subs x24, x24, x21\n"
        "beq 21f\n"
        "cmp x24, x23\n"
        "csel x19, x24, x23, LT\n"
        "lsr x20, x19, #0x2\n"
        "mov x12, #0x0\n"
        "and x19, x19, #0x3\n"
        "cbz x20, 19f\n"
        "18:" // Store to output array: Accumulator row 1 loop
        ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
        ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
        ".inst 0xa1604330 // st1w { z16.s, z24.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        ".inst 0xa1604331 // st1w { z17.s, z25.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "add x12, x12, #0x4\n"
        ".inst 0xa1604332 // st1w { z18.s, z26.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "cmp x12, x20, LSL #2\n"
        ".inst 0xa1604333 // st1w { z19.s, z27.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "blt 18b\n"
        "19:" // Store to output array: Accumulator row 1 oddments
        "cbz x19, 20f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
        ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
        ".inst 0xa1604324 // st1w { z4.s, z12.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 20f\n"
        "subs x19, x19, #0x1\n"
        ".inst 0xa1604325 // st1w { z5.s, z13.s }, p8, [x25]\n"
        "add x25, x25, x22\n"
        "beq 20f\n"
        ".inst 0xa1604326 // st1w { z6.s, z14.s }, p8, [x25]\n"
        "20:" // Store to output array: Accumulator row 1 oddments: End
        "21:" // Store to output array: End
        // Flags bit 0 still set: preload ZA for the next block from the buffer.
        "tbz x15, #0, 23f\n"
        "mov x12, #0x0\n"
        "cntw x19\n"
        "22:" // Store to output array: Refill accumulators: Loop
        ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
        ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
        ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
        ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
        ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
        ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
        ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
        ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
        "add x12, x12, #0x4\n"
        "cmp x12, x19\n"
        "addvl x14, x14, #16\n"
        "blt 22b\n"
        "23:" // End block
        // Advance N; when a full row of tiles is done, advance M and reset N.
        "incw x9, ALL, MUL #2\n"
        "cmp x9, x28\n"
        "blt 3b\n"
        "incw x10, ALL, MUL #2\n"
        "cmp x10, x11\n"
        "mov x9, #0x0\n"
        "mov x27, x26\n"
        "blt 3b\n"
        ".inst 0xd503467f // SMSTOP\n"
        :
        : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
        : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
    );
}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..be1106da13
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);
+
// Kernel descriptor for the SME2 interleaved GEMM: signed 8-bit operands,
// 32-bit integer accumulation/result, producing a tall 4VL x 1VL output tile
// per block. "nomerge" means the kernel stores its own results (or spills
// them to a partial-result accumulator buffer) instead of using a separate
// merge stage.
class cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL
{
public:
    typedef int8_t operand_type;
    typedef int32_t result_type;

    // Signature of the generated kernel entry point.
    typedef void (*kern_type)(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer);

    /* Kernel blocking parameters */
    // Output rows produced per block: four vector lengths of int32 elements.
    static unsigned int out_height()
    {
        return sme::get_vector_length<int32_t>() * 4;
    }

    // Output columns produced per block: one vector length of int32 elements.
    static unsigned int out_width()
    {
        return sme::get_vector_length<int32_t>() * 1;
    }

    // The K dimension is consumed in groups of four int8 values per output
    // element (matching the widening int8 -> int32 outer-product step).
    static constexpr unsigned int k_unroll()
    {
        return 4;
    }

    // Partial results can be carried across calls via the accumulator buffer.
    static constexpr bool supports_accumulate()
    {
        return true;
    }

    static constexpr bool supports_bias()
    {
        return true;
    }

    // Activation is not fused into this kernel; apply it separately if needed.
    static constexpr bool supports_activation()
    {
        return false;
    }

    static constexpr bool is_sme()
    {
        return true;
    }

    // Default to the generic kernel
    kern_type kernel = sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL;

    // Operand interleave/transform helpers: 4x1 VL blocking, k_unroll of 4.
    StdTransformsSME<operand_type, result_type, 4, 1, 4> transforms = {};

    cls_sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const CPUInfo *ci)
    {
        ARM_COMPUTE_UNUSED(ci);
    }
};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..3623f5b6c0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_s8s32_mopa_4VLx1VL(const int8_t *const A, const int8_t *const B, int32_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Activation act, bool accumulate, int32_t *const accumulator_buffer)
+{
+ ARM_COMPUTE_UNUSED(act);
+
+ struct KernelArgs
+ {
+ KernelArgs(
+ const int8_t *const A,
+ const int8_t *const B,
+ int32_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(int8_t)),
+ C(C), ldcb(ldc * sizeof(int32_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ }
+
+ const int8_t *const A;
+ const int8_t *const B;
+ const long kstride_bytes;
+ int32_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+
+ const int32_t *const bias;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "ldnt1w { z15.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0408350 // ld1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "ldnt1b { z7.b }, p1/Z, [x22]\n"
+ ".inst 0xa041835c // ld1b { z28.b-z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1b { z13.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa0428340 // ld1b { z0.b-z3.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa0438358 // ld1b { z24.b-z27.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z23.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
+ ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
+ ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+ ".inst 0xa0408350 // ld1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
+ "ldnt1b { z7.b }, p1/Z, [x22]\n"
+ ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
+ ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
+ ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
+ ".inst 0xa041835c // ld1b { z28.b-z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
+ "ldnt1b { z13.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
+ ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
+ ".inst 0xa0428340 // ld1b { z0.b-z3.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
+ ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
+ ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
+ ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
+ ".inst 0xa0438358 // ld1b { z24.b-z27.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z23.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+ ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
+ ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
+ ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+ ".inst 0xa08d2780 // smopa za0.s, p1/M, p1/M, z28.b, z13.b\n"
+ ".inst 0xa08d27a1 // smopa za1.s, p1/M, p1/M, z29.b, z13.b\n"
+ ".inst 0xa08d27c2 // smopa za2.s, p1/M, p1/M, z30.b, z13.b\n"
+ ".inst 0xa08d27e3 // smopa za3.s, p1/M, p1/M, z31.b, z13.b\n"
+ ".inst 0xa08c2400 // smopa za0.s, p1/M, p1/M, z0.b, z12.b\n"
+ ".inst 0xa08c2421 // smopa za1.s, p1/M, p1/M, z1.b, z12.b\n"
+ ".inst 0xa08c2442 // smopa za2.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa08c2463 // smopa za3.s, p1/M, p1/M, z3.b, z12.b\n"
+ ".inst 0xa0972700 // smopa za0.s, p1/M, p1/M, z24.b, z23.b\n"
+ ".inst 0xa0972721 // smopa za1.s, p1/M, p1/M, z25.b, z23.b\n"
+ ".inst 0xa0972742 // smopa za2.s, p1/M, p1/M, z26.b, z23.b\n"
+ ".inst 0xa0972763 // smopa za3.s, p1/M, p1/M, z27.b, z23.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa0408350 // ld1b { z16.b-z19.b }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1b { z7.b }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0xa0872600 // smopa za0.s, p1/M, p1/M, z16.b, z7.b\n"
+ ".inst 0xa0872621 // smopa za1.s, p1/M, p1/M, z17.b, z7.b\n"
+ ".inst 0xa0872642 // smopa za2.s, p1/M, p1/M, z18.b, z7.b\n"
+ ".inst 0xa0872663 // smopa za3.s, p1/M, p1/M, z19.b, z7.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860400 // mova { z0.s-z3.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1bc // st1w { z28.s-z31.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 29f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13]\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1a4 // st1w { z4.s-z7.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1a0 // st1w { z0.s-z3.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 29f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "sub x24, x11, x10\n"
+ "cntw x23\n"
+ "ldr x22, [%x[args], %[offsetof_ldcb]]\n"
+ "cmp x24, x23\n"
+ "csel x21, x24, x23, LT\n"
+ "add x25, x25, x9, LSL #2\n" // C += n
+ "lsr x20, x21, #0x2\n"
+ "madd x25, x10, x22, x25\n" // C += m * ldc
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 16f\n"
+ "15:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ "st1w { z28.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z29.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z30.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z31.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 15b\n"
+ "16:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 17f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ "st1w { z8.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 17f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z9.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 17f\n"
+ "st1w { z10.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "17:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x23\n"
+ "csel x21, x24, x23, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 19f\n"
+ "18:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ "st1w { z0.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z1.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z2.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z3.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 18b\n"
+ "19:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 20f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 20f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 20f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "20:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x23\n"
+ "csel x21, x24, x23, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 22f\n"
+ "21:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860450 // mova { z16.s-z19.s }, za2h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z19.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 21b\n"
+ "22:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 23f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860440 // mova { z0.s-z3.s }, za2h.s[x12]\n"
+ "st1w { z0.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 23f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z1.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 23f\n"
+ "st1w { z2.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "23:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 27f\n"
+ "cmp x24, x23\n"
+ "csel x19, x24, x23, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 25f\n"
+ "24:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ "st1w { z12.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "st1w { z13.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "add x12, x12, #0x4\n"
+ "st1w { z14.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "cmp x12, x20, LSL #2\n"
+ "st1w { z15.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "blt 24b\n"
+ "25:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 26f\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 26f\n"
+ "subs x19, x19, #0x1\n"
+ "st1w { z17.s }, p0, [x25]\n"
+ "add x25, x25, x22\n"
+ "beq 26f\n"
+ "st1w { z18.s }, p0, [x25]\n"
+ "26:" // Store to output array: Accumulator row 3 oddments: End
+ "27:" // Store to output array: End
+ "tbz x15, #0, 29f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "28:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c1c8 // ld1w { z8.s-z11.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840503 // mova za3h.s[x12], { z8.s-z11.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 28b\n"
+ "29:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
new file mode 100644
index 0000000000..c7bd38d905
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 1;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_1VLx4VL;
+
+ StdTransformsSME<operand_type, result_type, 1, 4, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
new file mode 100644
index 0000000000..100f15c7e0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_u8q_mopa_1VLx4VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ const long kstride_bytes;
+ uint8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x13, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x11, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x10, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x13, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840580 // mova za0h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa041c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840781 // mova za1h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa042c57c // ld1w { z28.s-z31.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840782 // mova za2h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w9, [%x[args], %[offsetof_M]]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldr w26, [%x[args], %[offsetof_N]]\n"
+ "ldr x25, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x24, x25\n"
+ ".inst 0x25ba6770 // whilelt pn8.s, x27, x26, VLx4\n"
+ "tbnz x13, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa01bc279 // ldnt1w { z24.s-z27.s }, p8/Z, [x19, x27, LSL #2]\n"
+ ".inst 0xc0902700 // addha za0.s, p1/M, p1/M, z24.s\n"
+ ".inst 0xc0902721 // addha za1.s, p1/M, p1/M, z25.s\n"
+ ".inst 0xc0902742 // addha za2.s, p1/M, p1/M, z26.s\n"
+ ".inst 0xc0902763 // addha za3.s, p1/M, p1/M, z27.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x27\n"
+ "mov x20, x28\n"
+ "incw x19, ALL, MUL #4\n"
+ "incw x20\n"
+ "cmp x19, x26\n"
+ "csel x20, x28, x20, LT\n"
+ "mov x19, x13\n"
+ "bfm x13, XZR, #0x0, #0x0 // bfc x13, #0x0, #0x1\n"
+ "cmp x20, x9\n"
+ "csel x13, x19, x13, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x27, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa04086dd // ldnt1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ "ld1b { z16.b }, p1/Z, [x24, #1, MUL VL]\n"
+ ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa04186cd // ldnt1b { z12.b-z15.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ "ld1b { z21.b }, p1/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xa04286d9 // ldnt1b { z24.b-z27.b }, pn9.b/Z, [x22, #0x8, MUL VL]\n"
+ ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "ld1b { z19.b }, p1/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0xa04386c1 // ldnt1b { z0.b-z3.b }, pn9.b/Z, [x22, #0xc, MUL VL]\n"
+ "addvl x22, x22, #16\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ ".inst 0xa1ac2600 // umopa za0.s, p1/M, p1/M, z16.b, z12.b\n"
+ ".inst 0xa1ad2601 // umopa za1.s, p1/M, p1/M, z16.b, z13.b\n"
+ ".inst 0xa1ae2602 // umopa za2.s, p1/M, p1/M, z16.b, z14.b\n"
+ ".inst 0xa1af2603 // umopa za3.s, p1/M, p1/M, z16.b, z15.b\n"
+ ".inst 0xa1b826a0 // umopa za0.s, p1/M, p1/M, z21.b, z24.b\n"
+ ".inst 0xa1b926a1 // umopa za1.s, p1/M, p1/M, z21.b, z25.b\n"
+ ".inst 0xa1ba26a2 // umopa za2.s, p1/M, p1/M, z21.b, z26.b\n"
+ ".inst 0xa1bb26a3 // umopa za3.s, p1/M, p1/M, z21.b, z27.b\n"
+ ".inst 0xa1a02660 // umopa za0.s, p1/M, p1/M, z19.b, z0.b\n"
+ ".inst 0xa1a12661 // umopa za1.s, p1/M, p1/M, z19.b, z1.b\n"
+ ".inst 0xa1a22662 // umopa za2.s, p1/M, p1/M, z19.b, z2.b\n"
+ ".inst 0xa1a32663 // umopa za3.s, p1/M, p1/M, z19.b, z3.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ "ld1b { z10.b }, p1/Z, [x24]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xa04086dc // ld1b { z28.b-z31.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #4\n"
+ ".inst 0xa1bc2540 // umopa za0.s, p1/M, p1/M, z10.b, z28.b\n"
+ ".inst 0xa1bd2541 // umopa za1.s, p1/M, p1/M, z10.b, z29.b\n"
+ ".inst 0xa1be2542 // umopa za2.s, p1/M, p1/M, z10.b, z30.b\n"
+ ".inst 0xa1bf2543 // umopa za3.s, p1/M, p1/M, z10.b, z31.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ "ld1w { z14.s }, p1/Z, [x24]\n"
+ "addvl x24, x24, #1\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c2 // addva za2.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c3 // addva za3.s, p1/M, p1/M, z14.s\n"
+ "tbz x13, #1, 14f\n"
+ "tbz x13, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c578 // ld1w { z24.s-z27.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0840700 // mova za0h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa042c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c564 // ld1w { z4.s-z7.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ "addvl x11, x11, #16\n"
+ ".inst 0xa061c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ ".inst 0xa062c558 // st1w { z24.s-z27.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c54c // st1w { z12.s-z15.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 11b\n"
+ "b 21f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc0860420 // mova { z0.s-z3.s }, za1h.s[x12]\n"
+ ".inst 0xa060c55c // st1w { z28.s-z31.s }, pn9.b, [x10]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa061c540 // st1w { z0.s-z3.s }, pn9.b, [x10, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c548 // st1w { z8.s-z11.s }, pn9.b, [x10, #0x8, MUL VL]\n"
+ ".inst 0xa063c550 // st1w { z16.s-z19.s }, pn9.b, [x10, #0xc, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "blt 13b\n"
+ "b 21f\n"
+ "14:" // Store to output array
+ "ldr x23, [%x[args], %[offsetof_C]]\n"
+ "add x23, x23, x27\n" // C += n
+ "sub x22, x9, x28\n"
+ "ld1rw { z12.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x21, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x23, x28, x21, x23\n" // C += m * ldc
+ "ld1rw { z13.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z15.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z20.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x13, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x27\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c26c // ld1w { z12.s-z15.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa040c264 // ld1w { z4.s-z7.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x19\n"
+ "whilelt p0.b, x27, x26\n"
+ "cmp x22, x19\n"
+ "csel x19, x22, x19, LT\n"
+ "lsr x20, x19, #0x1\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x1\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086001a // mova { z26.s-z27.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc086005c // mova { z28.s-z29.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca41a // sqdmulh { z26.s-z27.s }, { z26.s-z27.s }, z12.s\n"
+ ".inst 0xc0860096 // mova { z22.s-z23.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600d0 // mova { z16.s-z17.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada41c // sqdmulh { z28.s-z29.s }, { z28.s-z29.s }, z13.s\n"
+ ".inst 0xc1aea416 // sqdmulh { z22.s-z23.s }, { z22.s-z23.s }, z14.s\n"
+ "add x12, x12, #0x2\n"
+ "cmp x12, x20, LSL #1\n"
+ ".inst 0xc1afa410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z15.s\n"
+ ".inst 0xc1a4a23a // srshl { z26.s-z27.s }, { z26.s-z27.s }, z4.s\n"
+ ".inst 0xc1a5a23c // srshl { z28.s-z29.s }, { z28.s-z29.s }, z5.s\n"
+ ".inst 0xc1a6a236 // srshl { z22.s-z23.s }, { z22.s-z23.s }, z6.s\n"
+ ".inst 0xc1a7a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z7.s\n"
+ ".inst 0xc1a1a31a // add { z26.s-z27.s }, { z26.s-z27.s }, z1.s\n"
+ ".inst 0xc1a1a31c // add { z28.s-z29.s }, { z28.s-z29.s }, z1.s\n"
+ ".inst 0xc1a1a316 // add { z22.s-z23.s }, { z22.s-z23.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1b4c6ba // sclamp { z26.s-z27.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6bc // sclamp { z28.s-z29.s }, z21.s, z20.s\n"
+ "uzp1 z19.b, z26.b, z28.b\n"
+ ".inst 0xc1b4c6b6 // sclamp { z22.s-z23.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z22.b, z16.b\n"
+ "uzp1 z18.b, z27.b, z29.b\n"
+ "uzp1 z17.b, z23.b, z17.b\n"
+ "uzp1 z16.b, z19.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "uzp1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "add x23, x23, x21\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860002 // mova { z2.s-z3.s }, za0h.s[x12, 0:1]\n"
+ ".inst 0xc0860058 // mova { z24.s-z25.s }, za1h.s[x12, 0:1]\n"
+ ".inst 0xc1aca402 // sqdmulh { z2.s-z3.s }, { z2.s-z3.s }, z12.s\n"
+ ".inst 0xc0860090 // mova { z16.s-z17.s }, za2h.s[x12, 0:1]\n"
+ ".inst 0xc08600ca // mova { z10.s-z11.s }, za3h.s[x12, 0:1]\n"
+ ".inst 0xc1ada418 // sqdmulh { z24.s-z25.s }, { z24.s-z25.s }, z13.s\n"
+ ".inst 0xc1aea410 // sqdmulh { z16.s-z17.s }, { z16.s-z17.s }, z14.s\n"
+ ".inst 0xc1afa40a // sqdmulh { z10.s-z11.s }, { z10.s-z11.s }, z15.s\n"
+ ".inst 0xc1a4a222 // srshl { z2.s-z3.s }, { z2.s-z3.s }, z4.s\n"
+ ".inst 0xc1a5a238 // srshl { z24.s-z25.s }, { z24.s-z25.s }, z5.s\n"
+ ".inst 0xc1a6a230 // srshl { z16.s-z17.s }, { z16.s-z17.s }, z6.s\n"
+ ".inst 0xc1a7a22a // srshl { z10.s-z11.s }, { z10.s-z11.s }, z7.s\n"
+ ".inst 0xc1a1a302 // add { z2.s-z3.s }, { z2.s-z3.s }, z1.s\n"
+ ".inst 0xc1a1a318 // add { z24.s-z25.s }, { z24.s-z25.s }, z1.s\n"
+ ".inst 0xc1a1a310 // add { z16.s-z17.s }, { z16.s-z17.s }, z1.s\n"
+ ".inst 0xc1a1a30a // add { z10.s-z11.s }, { z10.s-z11.s }, z1.s\n"
+ ".inst 0xc1b4c6a2 // sclamp { z2.s-z3.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6b8 // sclamp { z24.s-z25.s }, z21.s, z20.s\n"
+ "uzp1 z23.b, z2.b, z24.b\n"
+ ".inst 0xc1b4c6b0 // sclamp { z16.s-z17.s }, z21.s, z20.s\n"
+ ".inst 0xc1b4c6aa // sclamp { z10.s-z11.s }, z21.s, z20.s\n"
+ "uzp1 z16.b, z16.b, z10.b\n"
+ "uzp1 z16.b, z23.b, z16.b\n"
+ "st1b { z16.b }, p0, [x23]\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "19:" // Store to output array: End
+ "tbz x13, #0, 21f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "20:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c570 // ld1w { z16.s-z19.s }, pn9.b/Z, [x11, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c56c // ld1w { z12.s-z15.s }, pn9.b/Z, [x11, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x11, x11, #16\n"
+ "blt 20b\n"
+ "21:" // End block
+ "incw x27, ALL, MUL #4\n"
+ "cmp x27, x26\n"
+ "blt 3b\n"
+ "incw x28\n"
+ "cmp x28, x9\n"
+ "mov x27, #0x0\n"
+ "mov x25, x24\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
new file mode 100644
index 0000000000..123405bd17
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+class cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 2;
+ }
+
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 2;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_2VLx2VL;
+
+ StdTransformsSME<operand_type, result_type, 2, 2, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
new file mode 100644
index 0000000000..6c42012482
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+// SME2 u8-quantised interleaved GEMM kernel, 2VL x 2VL output tile.
+// Computes C = requantise(A * B + bias [+ row/col sums]) over an M x N x K
+// problem using ZA outer-product accumulation (UMOPA), or spills/reloads the
+// raw int32 accumulators through 'accumulator_buffer' when the tile is not
+// final (accumulate==true fills ZA from the buffer; C==nullptr stores back
+// to it instead of emitting output). 'rq' supplies either per-layer or
+// per-channel (rq.per_channel_requant) multipliers/shifts plus c_offset and
+// the min/max clamp; 'bias' may be null (checked with cbz before use);
+// 'n_0' is the absolute column origin used to index per-channel parameters.
+void sme2_interleaved_nomerge_u8q_mopa_2VLx2VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ // Flattened argument record; the asm below reads fields via offsetof().
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ // Bytes per packed-B column panel: K rounded up to a multiple of 4.
+ const long kstride_bytes;
+ uint8_t *const C;
+ // Output row stride in bytes.
+ const long ldcb;
+ // NOTE(review): n_loops and n_tail_iters are computed here but have no
+ // offsetof() entry in the asm operand list below, so this kernel's asm
+ // never reads them — presumably kept for parity with sibling kernels.
+ const long M, N, K, n_loops, n_tail_iters;
+ // NOTE(review): min/max likewise are not referenced by the asm; the
+ // clamp bounds are loaded from rq.minval / rq.maxval instead.
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ // Machine-generated SME2 assembly (encoded as .inst words; each carries
+ // its disassembly). Structure: optional ZA fill from the accumulator
+ // buffer; M/N tile loop; bias broadcast (ADDHA); 4x-unrolled UMOPA K loop
+ // plus tail and oddments; row-sum ADDVA; then either spill raw int32
+ // accumulators to the buffer or requantise (sqdmulh/srshl/add/sclamp),
+ // narrow with uzp1 and store bytes to C.
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207811 // ptrue pn9.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840400 // mova za0h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa041c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c5c0 // ld1w { z0.s-z3.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840402 // mova za2h.s[x12], { z0.s-z3.s }\n"
+ ".inst 0xa043c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840783 // mova za3h.s[x12], { z28.s-z31.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ ".inst 0x25bc4530 // whilelt pn8.s, x9, x28, VLx2\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ ".inst 0xa0094275 // ldnt1w { z20.s-z21.s }, p8/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc0902680 // addha za0.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a1 // addha za1.s, p1/M, p1/M, z21.s\n"
+ ".inst 0xc0902682 // addha za2.s, p1/M, p1/M, z20.s\n"
+ ".inst 0xc09026a3 // addha za3.s, p1/M, p1/M, z21.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19, ALL, MUL #2\n"
+ "incw x20, ALL, MUL #2\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa04006d1 // ldnt1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa041074e // ld1b { z14.b-z15.b }, pn9.b/Z, [x26, #0x2, MUL VL]\n"
+ ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa04106c9 // ldnt1b { z8.b-z9.b }, pn9.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa0420740 // ld1b { z0.b-z1.b }, pn9.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa14206dc // ldnt1b { z20.b, z28.b }, pn9.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ ".inst 0xa0430744 // ld1b { z4.b-z5.b }, pn9.b/Z, [x26, #0x6, MUL VL]\n"
+ "addvl x26, x26, #8\n"
+ ".inst 0xa14306ca // ldnt1b { z2.b, z10.b }, pn9.b/Z, [x22, #0x6, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ ".inst 0xa1a825c0 // umopa za0.s, p1/M, p1/M, z14.b, z8.b\n"
+ ".inst 0xa1a925c1 // umopa za1.s, p1/M, p1/M, z14.b, z9.b\n"
+ ".inst 0xa1a825e2 // umopa za2.s, p1/M, p1/M, z15.b, z8.b\n"
+ ".inst 0xa1a925e3 // umopa za3.s, p1/M, p1/M, z15.b, z9.b\n"
+ ".inst 0xa1b42400 // umopa za0.s, p1/M, p1/M, z0.b, z20.b\n"
+ ".inst 0xa1bc2401 // umopa za1.s, p1/M, p1/M, z0.b, z28.b\n"
+ ".inst 0xa1b42422 // umopa za2.s, p1/M, p1/M, z1.b, z20.b\n"
+ ".inst 0xa1bc2423 // umopa za3.s, p1/M, p1/M, z1.b, z28.b\n"
+ ".inst 0xa1a22480 // umopa za0.s, p1/M, p1/M, z4.b, z2.b\n"
+ ".inst 0xa1aa2481 // umopa za1.s, p1/M, p1/M, z4.b, z10.b\n"
+ ".inst 0xa1a224a2 // umopa za2.s, p1/M, p1/M, z5.b, z2.b\n"
+ ".inst 0xa1aa24a3 // umopa za3.s, p1/M, p1/M, z5.b, z10.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa040075e // ld1b { z30.b-z31.b }, pn9.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xa04006d0 // ld1b { z16.b-z17.b }, pn9.b/Z, [x22]\n"
+ "addvl x22, x22, #2\n"
+ ".inst 0xa1b027c0 // umopa za0.s, p1/M, p1/M, z30.b, z16.b\n"
+ ".inst 0xa1b127c1 // umopa za1.s, p1/M, p1/M, z30.b, z17.b\n"
+ ".inst 0xa1b027e2 // umopa za2.s, p1/M, p1/M, z31.b, z16.b\n"
+ ".inst 0xa1b127e3 // umopa za3.s, p1/M, p1/M, z31.b, z17.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040474e // ld1w { z14.s-z15.s }, pn9.b/Z, [x26]\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0xc09125c0 // addva za0.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125c1 // addva za1.s, p1/M, p1/M, z14.s\n"
+ ".inst 0xc09125e2 // addva za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09125e3 // addva za3.s, p1/M, p1/M, z15.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c5dc // ld1w { z28.s-z31.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0860408 // mova { z8.s-z11.s }, za0h.s[x12]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xc0860434 // mova { z20.s-z23.s }, za1h.s[x12]\n"
+ ".inst 0xa041c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840701 // mova za1h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xc086045c // mova { z28.s-z31.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xa042c5d8 // ld1w { z24.s-z27.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840702 // mova za2h.s[x12], { z24.s-z27.s }\n"
+ ".inst 0xa043c5cc // ld1w { z12.s-z15.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840583 // mova za3h.s[x12], { z12.s-z15.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c5b4 // st1w { z20.s-z23.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c5bc // st1w { z28.s-z31.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 24f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0860424 // mova { z4.s-z7.s }, za1h.s[x12]\n"
+ ".inst 0xa060c5b0 // st1w { z16.s-z19.s }, pn9.b, [x13]\n"
+ ".inst 0xc0860448 // mova { z8.s-z11.s }, za2h.s[x12]\n"
+ ".inst 0xc086046c // mova { z12.s-z15.s }, za3h.s[x12]\n"
+ ".inst 0xa061c5a4 // st1w { z4.s-z7.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c5a8 // st1w { z8.s-z11.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c5ac // st1w { z12.s-z15.s }, pn9.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 24f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z2.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z3.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z0.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z11.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z25.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z24.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404262 // ld1w { z2.s-z3.s }, p8/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ ".inst 0xa0404260 // ld1w { z0.s-z1.s }, p8/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z2.s\n"
+ ".inst 0xc1a3ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc1a1aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z1.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z12.h, z28.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z13.h, z29.h\n"
+ "uzp1 z17.h, z14.h, z30.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z15.h, z31.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc086041c // mova { z28.s-z31.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xc1a2ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z2.s\n"
+ ".inst 0xc1a3ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc1a1aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1abab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z11.s\n"
+ ".inst 0xc1abab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z11.s\n"
+ ".inst 0xc1b8cf3c // sclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf2c // sclamp { z12.s-z15.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z28.h, z12.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z29.h, z13.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "uzp1 z16.h, z30.h, z14.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 22f\n"
+ "whilelt p0.h, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860444 // mova { z4.s-z7.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a0aa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf24 // sclamp { z4.s-z7.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z4.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z5.h, z17.h\n"
+ "uzp1 z17.h, z6.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "uzp1 z16.h, z7.h, z19.h\n"
+ "st1b { z17.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860470 // mova { z16.s-z19.s }, za3h.s[x12]\n"
+ ".inst 0xc1a2ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z2.s\n"
+ ".inst 0xc1a3ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z3.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a0aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z0.s\n"
+ ".inst 0xc1a1aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z1.s\n"
+ ".inst 0xc1abab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z11.s\n"
+ ".inst 0xc1abab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z11.s\n"
+ ".inst 0xc1b8cf34 // sclamp { z20.s-z23.s }, z25.s, z24.s\n"
+ ".inst 0xc1b8cf30 // sclamp { z16.s-z19.s }, z25.s, z24.s\n"
+ "uzp1 z16.h, z20.h, z16.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "uzp1 z16.h, z21.h, z17.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "uzp1 z16.h, z22.h, z18.h\n"
+ "st1b { z16.h }, p0, [x25]\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "22:" // Store to output array: End
+ "tbz x15, #0, 24f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "23:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14]\n"
+ ".inst 0xc0840600 // mova za0h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa041c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c5d0 // ld1w { z16.s-z19.s }, pn9.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c5c4 // ld1w { z4.s-z7.s }, pn9.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 23b\n"
+ "24:" // End block
+ "incw x9, ALL, MUL #2\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #2\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
new file mode 100644
index 0000000000..2e61cf49a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_sme.hpp"
+
+namespace arm_gemm
+{
+
+// Implementations
+void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+// Kernel descriptor for the tall-and-narrow SME2 interleaved "no-merge"
+// MOPA GEMM: quantised uint8 operands/output, producing a 4VL x 1VL tile
+// of 32-bit ZA accumulators per call (requantisation happens in-kernel).
+class cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL
+{
+public:
+ typedef uint8_t operand_type; // element type of the packed A and B panels
+ typedef uint8_t result_type; // element type of the requantised output C
+
+ typedef void (*kern_type)(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer);
+
+ /* Kernel blocking parameters */
+ // Output rows per kernel call: four vectors of 32-bit lanes.
+ static unsigned int out_height()
+ {
+ return sme::get_vector_length<uint32_t>() * 4;
+ }
+
+ // Output columns per kernel call: a single vector of 32-bit lanes.
+ static unsigned int out_width()
+ {
+ return sme::get_vector_length<uint32_t>() * 1;
+ }
+
+ // K is processed (and padded) in blocks of 4, matching the 8-bit
+ // dot-product granularity of the MOPA outer-product instruction.
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ // Accumulating directly into C is not supported; partial results are
+ // carried between calls via the int32 accumulator buffer instead.
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ static constexpr bool supports_bias()
+ {
+ return true;
+ }
+
+ // No fused activation; only the requantise min/max clamp is applied.
+ static constexpr bool supports_activation()
+ {
+ return false;
+ }
+
+ static constexpr bool is_sme()
+ {
+ return true;
+ }
+
+ // Default to the generic kernel
+ kern_type kernel = sme2_interleaved_nomerge_u8q_mopa_4VLx1VL;
+
+ // Interleave transforms: 4 VL-row blocks x 1 VL-col block, k-block 4,
+ // quantised variant (trailing 'true').
+ StdTransformsSME<operand_type, result_type, 4, 1, 4, true> transforms = {};
+
+ cls_sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const CPUInfo *ci)
+ {
+ ARM_COMPUTE_UNUSED(ci);
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
new file mode 100644
index 0000000000..40d2fff8c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SME2
+
+#include "arm_gemm.hpp"
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sme2_interleaved_nomerge_u8q_mopa_4VLx1VL(const uint8_t *const A, const uint8_t *const B, uint8_t *const C, int ldc, const int M, const int N, const int K, const int32_t *const bias, const Requantize32 &rq, const int n_0, bool accumulate, int32_t *const accumulator_buffer)
+{
+ struct KernelArgs
+ {
+ KernelArgs(
+ const uint8_t *const A,
+ const uint8_t *const B,
+ uint8_t *const C, const int ldc,
+ const int M, const int N, const int K,
+ const int32_t *const bias,
+ const Requantize32 &rq,
+ const int n_0,
+ bool accumulate,
+ int32_t *const accumulator_buffer
+ ) : A(A),
+ B(B), kstride_bytes(roundup(K, 4) * sizeof(uint8_t)),
+ C(C), ldcb(ldc * sizeof(uint8_t)),
+ M(M), N(N), K(K),
+ n_loops(((K / 4) - 1) / 2), n_tail_iters(((K / 4) - 1) % 2),
+
+ bias(bias), n_0(n_0),
+ accumulator_buffer(accumulator_buffer),
+ flags(0x0)
+ {
+ if (accumulate)
+ {
+ flags |= 1 << 0; // FILL_ACCUMULATORS_FROM_BUFFER
+ }
+ if (C == nullptr)
+ {
+ flags |= 1 << 1; // STORE_ACCUMULATORS_TO_BUFFER
+ }
+ if (rq.per_channel_requant)
+ {
+ flags |= 1 << 2; // PER_CHANNEL_QUANTISATION
+ }
+ }
+
+ const uint8_t *const A;
+ const uint8_t *const B;
+ const long kstride_bytes;
+ uint8_t *const C;
+ const long ldcb;
+ const long M, N, K, n_loops, n_tail_iters;
+ int32_t min = std::numeric_limits<uint8_t>::min();
+ int32_t max = std::numeric_limits<uint8_t>::max();
+
+ const int32_t *const bias;
+ const int n_0;
+
+ int32_t *const accumulator_buffer;
+ uint64_t flags;
+ };
+
+ // Construct arguments for this kernel
+ KernelArgs args(A, B, C, ldc, M, N, K, bias, rq, n_0, accumulate, accumulator_buffer);
+
+ __asm__ __volatile__(
+ "ldr x15, [%x[args], %[offsetof_flags]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p1.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x14, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "ldr x13, [%x[args], %[offsetof_accumulator_buffer]]\n"
+ "tbz x15, #0, 2f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "1:" // Initial accumulator load from buffer: Loop
+ ".inst 0xa040c1dc // ld1w { z28.s-z31.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840780 // mova za0h.s[x12], { z28.s-z31.s }\n"
+ ".inst 0xa041c1cc // ld1w { z12.s-z15.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840581 // mova za1h.s[x12], { z12.s-z15.s }\n"
+ ".inst 0xa042c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840682 // mova za2h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xa043c1d8 // ld1w { z24.s-z27.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840703 // mova za3h.s[x12], { z24.s-z27.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 1b\n"
+ "2:" // Initial accumulator load from buffer: End
+ "ldr w11, [%x[args], %[offsetof_M]]\n"
+ "mov x10, #0x0\n"
+ "mov x9, #0x0\n"
+ "ldr w28, [%x[args], %[offsetof_N]]\n"
+ "ldr x27, [%x[args], %[offsetof_A]]\n"
+ "3:" // M and N loop
+ "mov x26, x27\n"
+ "whilelt p0.s, x9, x28\n"
+ "tbnz x15, #0, 4f\n"
+ "ldr x19, [%x[args], %[offsetof_bias]]\n"
+ ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
+ "cbz x19, 5f\n"
+ "ldnt1w { z15.s }, p0/Z, [x19, x9, LSL #2]\n"
+ ".inst 0xc09025e0 // addha za0.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e1 // addha za1.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e2 // addha za2.s, p1/M, p1/M, z15.s\n"
+ ".inst 0xc09025e3 // addha za3.s, p1/M, p1/M, z15.s\n"
+ "4:" // Prepare accumulators: Test for last block
+ "mov x19, x9\n"
+ "mov x20, x10\n"
+ "incw x19\n"
+ "incw x20, ALL, MUL #4\n"
+ "cmp x19, x28\n"
+ "csel x20, x10, x20, LT\n"
+ "mov x19, x15\n"
+ "bfm x15, XZR, #0x0, #0x0 // bfc x15, #0x0, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x15, x19, x15, LT\n"
+ "5:" // Prepare accumulators: End
+ "ldr x19, [%x[args], %[offsetof_K]]\n"
+ "add x19, x19, #0x3\n"
+ "lsr x19, x19, #0x2\n"
+ "ldr x22, [%x[args], %[offsetof_B]]\n"
+ "lsr x21, x19, #0x2\n"
+ "and x20, x19, #0x3\n"
+ "ldr x19, [%x[args], %[offsetof_kstride_bytes]]\n"
+ "madd x22, x9, x19, x22\n" // bptr = B + n * kstride_bytes
+ "cbz x21, 8f\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "ble 7f\n"
+ "6:" // K loop
+ ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ "ldnt1b { z0.b }, p1/Z, [x22]\n"
+ ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa1418353 // ld1b { z19.b, z23.b, z27.b, z31.b }, pn8.b/Z, [x26, #0x4, MUL VL]\n"
+ ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ "ldnt1b { z9.b }, p1/Z, [x22, #1, MUL VL]\n"
+ ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa1428350 // ld1b { z16.b, z20.b, z24.b, z28.b }, pn8.b/Z, [x26, #0x8, MUL VL]\n"
+ "ldnt1b { z21.b }, p1/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ ".inst 0xa1438342 // ld1b { z2.b, z6.b, z10.b, z14.b }, pn8.b/Z, [x26, #0xc, MUL VL]\n"
+ "addvl x26, x26, #16\n"
+ "ldnt1b { z12.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "bgt 6b\n"
+ "7:" // K loop tail
+ ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ ".inst 0xa1a92660 // umopa za0.s, p1/M, p1/M, z19.b, z9.b\n"
+ ".inst 0xa1a926e1 // umopa za1.s, p1/M, p1/M, z23.b, z9.b\n"
+ ".inst 0xa1a92762 // umopa za2.s, p1/M, p1/M, z27.b, z9.b\n"
+ ".inst 0xa1a927e3 // umopa za3.s, p1/M, p1/M, z31.b, z9.b\n"
+ ".inst 0xa1b52600 // umopa za0.s, p1/M, p1/M, z16.b, z21.b\n"
+ ".inst 0xa1b52681 // umopa za1.s, p1/M, p1/M, z20.b, z21.b\n"
+ ".inst 0xa1b52702 // umopa za2.s, p1/M, p1/M, z24.b, z21.b\n"
+ ".inst 0xa1b52783 // umopa za3.s, p1/M, p1/M, z28.b, z21.b\n"
+ ".inst 0xa1ac2440 // umopa za0.s, p1/M, p1/M, z2.b, z12.b\n"
+ ".inst 0xa1ac24c1 // umopa za1.s, p1/M, p1/M, z6.b, z12.b\n"
+ ".inst 0xa1ac2542 // umopa za2.s, p1/M, p1/M, z10.b, z12.b\n"
+ ".inst 0xa1ac25c3 // umopa za3.s, p1/M, p1/M, z14.b, z12.b\n"
+ "8:" // K oddments
+ "cbz x20, 10f\n"
+ "9:" // K oddments: Loop
+ ".inst 0xa1408352 // ld1b { z18.b, z22.b, z26.b, z30.b }, pn8.b/Z, [x26]\n"
+ "subs x20, x20, #0x1\n"
+ "addvl x26, x26, #4\n"
+ "ld1b { z0.b }, p1/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ ".inst 0xa1a02640 // umopa za0.s, p1/M, p1/M, z18.b, z0.b\n"
+ ".inst 0xa1a026c1 // umopa za1.s, p1/M, p1/M, z22.b, z0.b\n"
+ ".inst 0xa1a02742 // umopa za2.s, p1/M, p1/M, z26.b, z0.b\n"
+ ".inst 0xa1a027c3 // umopa za3.s, p1/M, p1/M, z30.b, z0.b\n"
+ "bgt 9b\n"
+ "10:" // K oddments: End
+ ".inst 0xa040c340 // ld1w { z0.s-z3.s }, pn8.b/Z, [x26]\n"
+ "addvl x26, x26, #4\n"
+ ".inst 0xc0912400 // addva za0.s, p1/M, p1/M, z0.s\n"
+ ".inst 0xc0912421 // addva za1.s, p1/M, p1/M, z1.s\n"
+ ".inst 0xc0912442 // addva za2.s, p1/M, p1/M, z2.s\n"
+ ".inst 0xc0912463 // addva za3.s, p1/M, p1/M, z3.s\n"
+ "tbz x15, #1, 14f\n"
+ "tbz x15, #0, 12f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "11:" // Store to partial result buffer: Store and refill: Loop
+ ".inst 0xa040c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc0840680 // mova za0h.s[x12], { z20.s-z23.s }\n"
+ ".inst 0xc0860428 // mova { z8.s-z11.s }, za1h.s[x12]\n"
+ ".inst 0xa041c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840481 // mova za1h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc086047c // mova { z28.s-z31.s }, za3h.s[x12]\n"
+ ".inst 0xa042c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840482 // mova za2h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa043c1d4 // ld1w { z20.s-z23.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840683 // mova za3h.s[x12], { z20.s-z23.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ "addvl x14, x14, #16\n"
+ ".inst 0xa061c1a8 // st1w { z8.s-z11.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ ".inst 0xa062c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1bc // st1w { z28.s-z31.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 11b\n"
+ "b 30f\n"
+ "12:" // Store to partial result buffer: Store only
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "13:" // Store to partial result buffer: Store only: Loop
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc086042c // mova { z12.s-z15.s }, za1h.s[x12]\n"
+ ".inst 0xa060c1b0 // st1w { z16.s-z19.s }, pn8.b, [x13]\n"
+ ".inst 0xc0860454 // mova { z20.s-z23.s }, za2h.s[x12]\n"
+ ".inst 0xc0860478 // mova { z24.s-z27.s }, za3h.s[x12]\n"
+ ".inst 0xa061c1ac // st1w { z12.s-z15.s }, pn8.b, [x13, #0x4, MUL VL]\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ ".inst 0xa062c1b4 // st1w { z20.s-z23.s }, pn8.b, [x13, #0x8, MUL VL]\n"
+ ".inst 0xa063c1b8 // st1w { z24.s-z27.s }, pn8.b, [x13, #0xc, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "blt 13b\n"
+ "b 30f\n"
+ "14:" // Store to output array
+ "ldr x25, [%x[args], %[offsetof_C]]\n"
+ "add x25, x25, x9\n" // C += n
+ "sub x24, x11, x10\n"
+ "ld1rw { z8.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ldr x23, [%x[args], %[offsetof_ldcb]]\n"
+ "madd x25, x10, x23, x25\n" // C += m * ldc
+ "ld1rw { z7.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z6.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z5.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z4.s }, p1/Z, [%x[rq], %[offsetof_Requantize32_maxval]]\n"
+ "tbz x15, #2, 15f\n"
+ "ldr w20, [%x[args], %[offsetof_n_0]]\n"
+ "add x20, x20, x9\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z8.s }, p0/Z, [x19]\n"
+ "ldr x19, [%x[rq], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "add x19, x19, x20, LSL #2\n"
+ "ld1w { z7.s }, p0/Z, [x19]\n"
+ "15:" // Store to output array: Load per-channel parameters: End
+ "cntw x22\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 17f\n"
+ "16:" // Store to output array: Accumulator row 0 loop
+ ".inst 0xc086040c // mova { z12.s-z15.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z15.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 16b\n"
+ "17:" // Store to output array: Accumulator row 0 oddments
+ "cbz x19, 18f\n"
+ ".inst 0xc0860410 // mova { z16.s-z19.s }, za0h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 18f\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "18:" // Store to output array: Accumulator row 0 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 20f\n"
+ "19:" // Store to output array: Accumulator row 1 loop
+ ".inst 0xc0860430 // mova { z16.s-z19.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a4ccb0 // sclamp { z16.s-z19.s }, z5.s, z4.s\n"
+ "st1b { z16.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z17.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z18.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z19.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 19b\n"
+ "20:" // Store to output array: Accumulator row 1 oddments
+ "cbz x19, 21f\n"
+ ".inst 0xc086043c // mova { z28.s-z31.s }, za1h.s[x12]\n"
+ ".inst 0xc1a8ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z7.s\n"
+ ".inst 0xc1a6ab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ ".inst 0xc1a4ccbc // sclamp { z28.s-z31.s }, z5.s, z4.s\n"
+ "st1b { z28.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z29.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 21f\n"
+ "st1b { z30.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "21:" // Store to output array: Accumulator row 1 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x21, x24, x22, LT\n"
+ "lsr x20, x21, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x21, #0x3\n"
+ "cbz x20, 23f\n"
+ "22:" // Store to output array: Accumulator row 2 loop
+ ".inst 0xc0860458 // mova { z24.s-z27.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ ".inst 0xc1a4ccb8 // sclamp { z24.s-z27.s }, z5.s, z4.s\n"
+ "st1b { z24.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z25.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z26.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z27.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 22b\n"
+ "23:" // Store to output array: Accumulator row 2 oddments
+ "cbz x19, 24f\n"
+ ".inst 0xc086044c // mova { z12.s-z15.s }, za2h.s[x12]\n"
+ ".inst 0xc1a8ac0c // sqdmulh { z12.s-z15.s }, { z12.s-z15.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa2c // srshl { z12.s-z15.s }, { z12.s-z15.s }, z7.s\n"
+ ".inst 0xc1a6ab0c // add { z12.s-z15.s }, { z12.s-z15.s }, z6.s\n"
+ ".inst 0xc1a4ccac // sclamp { z12.s-z15.s }, z5.s, z4.s\n"
+ "st1b { z12.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z13.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 24f\n"
+ "st1b { z14.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "24:" // Store to output array: Accumulator row 2 oddments: End
+ "subs x24, x24, x21\n"
+ "beq 28f\n"
+ "whilelt p0.s, x9, x28\n"
+ "cmp x24, x22\n"
+ "csel x19, x24, x22, LT\n"
+ "lsr x20, x19, #0x2\n"
+ "mov x12, #0x0\n"
+ "and x19, x19, #0x3\n"
+ "cbz x20, 26f\n"
+ "25:" // Store to output array: Accumulator row 3 loop
+ ".inst 0xc0860474 // mova { z20.s-z23.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z8.s\n"
+ "add x12, x12, #0x4\n"
+ ".inst 0xc1a7aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ "cmp x12, x20, LSL #2\n"
+ ".inst 0xc1a6ab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z6.s\n"
+ ".inst 0xc1a4ccb4 // sclamp { z20.s-z23.s }, z5.s, z4.s\n"
+ "st1b { z20.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z21.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z22.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "st1b { z23.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "blt 25b\n"
+ "26:" // Store to output array: Accumulator row 3 oddments
+ "cbz x19, 27f\n"
+ ".inst 0xc0860460 // mova { z0.s-z3.s }, za3h.s[x12]\n"
+ ".inst 0xc1a8ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z8.s\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0xc1a7aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z7.s\n"
+ ".inst 0xc1a6ab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ ".inst 0xc1a4cca0 // sclamp { z0.s-z3.s }, z5.s, z4.s\n"
+ "st1b { z0.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "subs x19, x19, #0x1\n"
+ "st1b { z1.s }, p0, [x25]\n"
+ "add x25, x25, x23\n"
+ "beq 27f\n"
+ "st1b { z2.s }, p0, [x25]\n"
+ "27:" // Store to output array: Accumulator row 3 oddments: End
+ "28:" // Store to output array: End
+ "tbz x15, #0, 30f\n"
+ "mov x12, #0x0\n"
+ "cntw x19\n"
+ "29:" // Store to output array: Refill accumulators: Loop
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ ".inst 0xc0840480 // mova za0h.s[x12], { z4.s-z7.s }\n"
+ ".inst 0xa041c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x4, MUL VL]\n"
+ ".inst 0xc0840601 // mova za1h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa042c1d0 // ld1w { z16.s-z19.s }, pn8.b/Z, [x14, #0x8, MUL VL]\n"
+ ".inst 0xc0840602 // mova za2h.s[x12], { z16.s-z19.s }\n"
+ ".inst 0xa043c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14, #0xc, MUL VL]\n"
+ ".inst 0xc0840483 // mova za3h.s[x12], { z4.s-z7.s }\n"
+ "add x12, x12, #0x4\n"
+ "cmp x12, x19\n"
+ "addvl x14, x14, #16\n"
+ "blt 29b\n"
+ "30:" // End block
+ "incw x9\n"
+ "cmp x9, x28\n"
+ "blt 3b\n"
+ "incw x10, ALL, MUL #4\n"
+ "cmp x10, x11\n"
+ "mov x9, #0x0\n"
+ "mov x27, x26\n"
+ "blt 3b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_A] "I" (offsetof(KernelArgs, A)), [offsetof_B] "I" (offsetof(KernelArgs, B)), [offsetof_C] "I" (offsetof(KernelArgs, C)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_M] "I" (offsetof(KernelArgs, M)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_Requantize32_c_offset] "I" (offsetof(Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [offsetof_accumulator_buffer] "I" (offsetof(KernelArgs, accumulator_buffer)), [offsetof_bias] "I" (offsetof(KernelArgs, bias)), [offsetof_flags] "I" (offsetof(KernelArgs, flags)), [offsetof_kstride_bytes] "I" (offsetof(KernelArgs, kstride_bytes)), [offsetof_ldcb] "I" (offsetof(KernelArgs, ldcb)), [offsetof_n_0] "I" (offsetof(KernelArgs, n_0)), [rq] "r" (&rq)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // ARM_COMPUTE_ENABLE_SME2
+#endif // __ARM_FEATURE_SVE